
Remove unused bin/x86_64 directory hierarchy

pull/53/merge
Rob Armstrong 2 months ago
commit ab68d58d59
  1. 36
      bin/x86_64/linux/release/APM_BlackScholes.txt
  2. 34
      bin/x86_64/linux/release/APM_BlackScholes_nvrtc.txt
  3. 37
      bin/x86_64/linux/release/APM_FDTD3d.txt
  4. 9
      bin/x86_64/linux/release/APM_HSOpticalFlow.txt
  5. 14
      bin/x86_64/linux/release/APM_MC_EstimatePiInlineP.txt
  6. 14
      bin/x86_64/linux/release/APM_MC_EstimatePiInlineQ.txt
  7. 14
      bin/x86_64/linux/release/APM_MC_EstimatePiP.txt
  8. 14
      bin/x86_64/linux/release/APM_MC_EstimatePiQ.txt
  9. 13
      bin/x86_64/linux/release/APM_MC_SingleAsianOptionP.txt
  10. 19
      bin/x86_64/linux/release/APM_MersenneTwisterGP11213.txt
  11. 29
      bin/x86_64/linux/release/APM_MonteCarloMultiGPU.txt
  12. 10
      bin/x86_64/linux/release/APM_NV12toBGRandResize.txt
  13. 19
      bin/x86_64/linux/release/APM_SobolQRNG.txt
  14. 6
      bin/x86_64/linux/release/APM_StreamPriorities.txt
  15. 17
      bin/x86_64/linux/release/APM_UnifiedMemoryPerf.txt
  16. 44
      bin/x86_64/linux/release/APM_UnifiedMemoryStreams.txt
  17. 51
      bin/x86_64/linux/release/APM_alignedTypes.txt
  18. 7
      bin/x86_64/linux/release/APM_asyncAPI.txt
  19. 24
      bin/x86_64/linux/release/APM_bandwidthTest.txt
  20. 59
      bin/x86_64/linux/release/APM_batchCUBLAS.txt
  21. 21
      bin/x86_64/linux/release/APM_batchedLabelMarkersAndLabelCompressionNPP.txt
  22. 11
      bin/x86_64/linux/release/APM_bf16TensorCoreGemm.txt
  23. 8
      bin/x86_64/linux/release/APM_binaryPartitionCG.txt
  24. 22
      bin/x86_64/linux/release/APM_binomialOptions.txt
  25. 23
      bin/x86_64/linux/release/APM_binomialOptions_nvrtc.txt
  26. 4
      bin/x86_64/linux/release/APM_c++11_cuda.txt
  27. 6
      bin/x86_64/linux/release/APM_cdpAdvancedQuicksort.txt
  28. 2
      bin/x86_64/linux/release/APM_cdpBezierTessellation.txt
  29. 5
      bin/x86_64/linux/release/APM_cdpQuadtree.txt
  30. 23
      bin/x86_64/linux/release/APM_cdpSimplePrint.txt
  31. 6
      bin/x86_64/linux/release/APM_cdpSimpleQuicksort.txt
  32. 4
      bin/x86_64/linux/release/APM_clock.txt
  33. 5
      bin/x86_64/linux/release/APM_clock_nvrtc.txt
  34. 8
      bin/x86_64/linux/release/APM_concurrentKernels.txt
  35. 13
      bin/x86_64/linux/release/APM_conjugateGradient.txt
  36. 13
      bin/x86_64/linux/release/APM_conjugateGradientCudaGraphs.txt
  37. 8
      bin/x86_64/linux/release/APM_conjugateGradientMultiBlockCG.txt
  38. 4
      bin/x86_64/linux/release/APM_conjugateGradientMultiDeviceCG.txt
  39. 18
      bin/x86_64/linux/release/APM_conjugateGradientPrecond.txt
  40. 16
      bin/x86_64/linux/release/APM_conjugateGradientUM.txt
  41. 41
      bin/x86_64/linux/release/APM_convolutionFFT2D.txt
  42. 21
      bin/x86_64/linux/release/APM_convolutionSeparable.txt
  43. 17
      bin/x86_64/linux/release/APM_convolutionTexture.txt
  44. 4
      bin/x86_64/linux/release/APM_cppIntegration.txt
  45. 30
      bin/x86_64/linux/release/APM_cppOverload.txt
  46. 0
      bin/x86_64/linux/release/APM_cuHook.txt
  47. 15
      bin/x86_64/linux/release/APM_cuSolverDn_LinearSolver.txt
  48. 58
      bin/x86_64/linux/release/APM_cuSolverRf.txt
  49. 38
      bin/x86_64/linux/release/APM_cuSolverSp_LinearSolver.txt
  50. 24
      bin/x86_64/linux/release/APM_cuSolverSp_LowlevelCholesky.txt
  51. 25
      bin/x86_64/linux/release/APM_cuSolverSp_LowlevelQR.txt
  52. 9
      bin/x86_64/linux/release/APM_cudaCompressibleMemory.txt
  53. 8
      bin/x86_64/linux/release/APM_cudaOpenMP.txt
  54. 11
      bin/x86_64/linux/release/APM_cudaTensorCoreGemm.txt
  55. 32
      bin/x86_64/linux/release/APM_dct8x8.txt
  56. 46
      bin/x86_64/linux/release/APM_deviceQuery.txt
  57. 43
      bin/x86_64/linux/release/APM_deviceQueryDrv.txt
  58. 11
      bin/x86_64/linux/release/APM_dmmaTensorCoreGemm.txt
  59. 11
      bin/x86_64/linux/release/APM_dwtHaar1D.txt
  60. 16
      bin/x86_64/linux/release/APM_dxtc.txt
  61. 13
      bin/x86_64/linux/release/APM_eigenvalues.txt
  62. 17
      bin/x86_64/linux/release/APM_fastWalshTransform.txt
  63. 5
      bin/x86_64/linux/release/APM_fp16ScalarProduct.txt
  64. 11
      bin/x86_64/linux/release/APM_globalToShmemAsyncCopy.txt
  65. 89
      bin/x86_64/linux/release/APM_graphMemoryFootprint.txt
  66. 34
      bin/x86_64/linux/release/APM_graphMemoryNodes.txt
  67. 48
      bin/x86_64/linux/release/APM_histogram.txt
  68. 11
      bin/x86_64/linux/release/APM_immaTensorCoreGemm.txt
  69. 4
      bin/x86_64/linux/release/APM_inlinePTX.txt
  70. 5
      bin/x86_64/linux/release/APM_inlinePTX_nvrtc.txt
  71. 15
      bin/x86_64/linux/release/APM_interval.txt
  72. 9
      bin/x86_64/linux/release/APM_jacobiCudaGraphs.txt
  73. 0
      bin/x86_64/linux/release/APM_libcuhook.so.1.txt
  74. 7
      bin/x86_64/linux/release/APM_lineOfSight.txt
  75. 10
      bin/x86_64/linux/release/APM_matrixMul.txt
  76. 12
      bin/x86_64/linux/release/APM_matrixMulCUBLAS.txt
  77. 11
      bin/x86_64/linux/release/APM_matrixMulDrv.txt
  78. 6
      bin/x86_64/linux/release/APM_matrixMulDynlinkJIT.txt
  79. 9
      bin/x86_64/linux/release/APM_matrixMul_nvrtc.txt
  80. 6
      bin/x86_64/linux/release/APM_memMapIPCDrv.txt
  81. 17
      bin/x86_64/linux/release/APM_mergeSort.txt
  82. 11
      bin/x86_64/linux/release/APM_newdelete.txt
  83. 56
      bin/x86_64/linux/release/APM_nvJPEG.txt
  84. 62
      bin/x86_64/linux/release/APM_nvJPEG_encoder.txt
  85. 35
      bin/x86_64/linux/release/APM_p2pBandwidthLatencyTest.txt
  86. 15
      bin/x86_64/linux/release/APM_ptxjit.txt
  87. 24
      bin/x86_64/linux/release/APM_quasirandomGenerator.txt
  88. 27
      bin/x86_64/linux/release/APM_quasirandomGenerator_nvrtc.txt
  89. 9
      bin/x86_64/linux/release/APM_radixSortThrust.txt
  90. 18
      bin/x86_64/linux/release/APM_reduction.txt
  91. 14
      bin/x86_64/linux/release/APM_reductionMultiBlockCG.txt
  92. 19
      bin/x86_64/linux/release/APM_scalarProd.txt
  93. 138
      bin/x86_64/linux/release/APM_scan.txt
  94. 6
      bin/x86_64/linux/release/APM_segmentationTreeThrust.txt
  95. 24
      bin/x86_64/linux/release/APM_shfl_scan.txt
  96. 6
      bin/x86_64/linux/release/APM_simpleAWBarrier.txt
  97. 16
      bin/x86_64/linux/release/APM_simpleAssert.txt
  98. 12
      bin/x86_64/linux/release/APM_simpleAssert_nvrtc.txt
  99. 5
      bin/x86_64/linux/release/APM_simpleAtomicIntrinsics.txt
  100. 6
      bin/x86_64/linux/release/APM_simpleAtomicIntrinsics_nvrtc.txt
Some files were not shown because too many files have changed in this diff

36
bin/x86_64/linux/release/APM_BlackScholes.txt

@@ -1,36 +0,0 @@
[./BlackScholes] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Initializing data...
...allocating CPU memory for options.
...allocating GPU memory for options.
...generating input data in CPU mem.
...copying input data to GPU mem.
Data init done.
Executing Black-Scholes GPU kernel (512 iterations)...
Options count : 8000000
BlackScholesGPU() time : 0.048059 msec
Effective memory bandwidth: 1664.634581 GB/s
Gigaoptions per second : 166.463458
BlackScholes, Throughput = 166.4635 GOptions/s, Time = 0.00005 s, Size = 8000000 options, NumDevsUsed = 1, Workgroup = 128
Reading back GPU results...
Checking the results...
...running CPU calculations.
Comparing the results...
L1 norm: 1.741792E-07
Max absolute error: 1.192093E-05
Shutting down...
...releasing GPU memory.
...releasing CPU memory.
Shutdown done.
[BlackScholes] - Test Summary
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
Test passed

34
bin/x86_64/linux/release/APM_BlackScholes_nvrtc.txt

@@ -1,34 +0,0 @@
[./BlackScholes_nvrtc] - Starting...
Initializing data...
...allocating CPU memory for options.
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
...allocating GPU memory for options.
...generating input data in CPU mem.
...copying input data to GPU mem.
Data init done.
Executing Black-Scholes GPU kernel (512 iterations)...
Options count : 8000000
BlackScholesGPU() time : 0.047896 msec
Effective memory bandwidth: 1670.268678 GB/s
Gigaoptions per second : 167.026868
BlackScholes, Throughput = 167.0269 GOptions/s, Time = 0.00005 s, Size = 8000000 options, NumDevsUsed = 1, Workgroup = 128
Reading back GPU results...
Checking the results...
...running CPU calculations.
Comparing the results...
L1 norm: 1.741792E-07
Max absolute error: 1.192093E-05
Shutting down...
...releasing GPU memory.
...releasing CPU memory.
Shutdown done.
[./BlackScholes_nvrtc] - Test Summary
Test passed

37
bin/x86_64/linux/release/APM_FDTD3d.txt

@@ -1,37 +0,0 @@
./FDTD3d Starting...
Set-up, based upon target device GMEM size...
getTargetDeviceGlobalMemSize
cudaGetDeviceCount
GPU Device 0: "Hopper" with compute capability 9.0
cudaGetDeviceProperties
generateRandomData
FDTD on 376 x 376 x 376 volume with symmetric filter radius 4 for 5 timesteps...
fdtdReference...
calloc intermediate
Host FDTD loop
t = 0
t = 1
t = 2
t = 3
t = 4
fdtdReference complete
fdtdGPU...
GPU Device 0: "Hopper" with compute capability 9.0
set block size to 32x16
set grid size to 12x24
GPU FDTD loop
t = 0 launch kernel
t = 1 launch kernel
t = 2 launch kernel
t = 3 launch kernel
t = 4 launch kernel
fdtdGPU complete
CompareData (tolerance 0.000100)...

9
bin/x86_64/linux/release/APM_HSOpticalFlow.txt

@@ -1,9 +0,0 @@
HSOpticalFlow Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Loading "frame10.ppm" ...
Loading "frame11.ppm" ...
Computing optical flow on CPU...
Computing optical flow on GPU...
L1 error : 0.044308

14
bin/x86_64/linux/release/APM_MC_EstimatePiInlineP.txt

@@ -1,14 +0,0 @@
Monte Carlo Estimate Pi (with inline PRNG)
==========================================
Estimating Pi on GPU (NVIDIA H100 PCIe)
Precision: single
Number of sims: 100000
Tolerance: 1.000000e-02
GPU result: 3.141480e+00
Expected: 3.141593e+00
Absolute error: 1.127720e-04
Relative error: 3.589644e-05
MonteCarloEstimatePiInlineP, Performance = 746954.33 sims/s, Time = 133.88(ms), NumDevsUsed = 1, Blocksize = 128

14
bin/x86_64/linux/release/APM_MC_EstimatePiInlineQ.txt

@@ -1,14 +0,0 @@
Monte Carlo Estimate Pi (with inline QRNG)
==========================================
Estimating Pi on GPU (NVIDIA H100 PCIe)
Precision: single
Number of sims: 100000
Tolerance: 1.000000e-02
GPU result: 3.141840e+00
Expected: 3.141593e+00
Absolute error: 2.472401e-04
Relative error: 7.869895e-05
MonteCarloEstimatePiInlineQ, Performance = 677644.44 sims/s, Time = 147.57(ms), NumDevsUsed = 1, Blocksize = 128

14
bin/x86_64/linux/release/APM_MC_EstimatePiP.txt

@@ -1,14 +0,0 @@
Monte Carlo Estimate Pi (with batch PRNG)
=========================================
Estimating Pi on GPU (NVIDIA H100 PCIe)
Precision: single
Number of sims: 100000
Tolerance: 1.000000e-02
GPU result: 3.136320e+00
Expected: 3.141593e+00
Absolute error: 5.272627e-03
Relative error: 1.678329e-03
MonteCarloEstimatePiP, Performance = 652941.82 sims/s, Time = 153.15(ms), NumDevsUsed = 1, Blocksize = 128

14
bin/x86_64/linux/release/APM_MC_EstimatePiQ.txt

@@ -1,14 +0,0 @@
Monte Carlo Estimate Pi (with batch QRNG)
=========================================
Estimating Pi on GPU (NVIDIA H100 PCIe)
Precision: single
Number of sims: 100000
Tolerance: 1.000000e-02
GPU result: 3.141840e+00
Expected: 3.141593e+00
Absolute error: 2.472401e-04
Relative error: 7.869895e-05
MonteCarloEstimatePiQ, Performance = 821146.16 sims/s, Time = 121.78(ms), NumDevsUsed = 1, Blocksize = 128

13
bin/x86_64/linux/release/APM_MC_SingleAsianOptionP.txt

@@ -1,13 +0,0 @@
Monte Carlo Single Asian Option (with PRNG)
===========================================
Pricing option on GPU (NVIDIA H100 PCIe)
Precision: single
Number of sims: 100000
Spot | Strike | r | sigma | tenor | Call/Put | Value | Expected |
-----------|------------|------------|------------|------------|------------|------------|------------|
40 | 35 | 0.03 | 0.2 | 0.333333 | Call | 5.17083 | 5.16253 |
MonteCarloSingleAsianOptionP, Performance = 824402.28 sims/s, Time = 121.30(ms), NumDevsUsed = 1, Blocksize = 128

19
bin/x86_64/linux/release/APM_MersenneTwisterGP11213.txt

@@ -1,19 +0,0 @@
./MersenneTwisterGP11213 Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Allocating data for 2400000 samples...
Seeding with 777 ...
Generating random numbers on GPU...
Reading back the results...
Generating random numbers on CPU...
Comparing CPU/GPU random numbers...
Max absolute error: 0.000000E+00
L1 norm: 0.000000E+00
MersenneTwisterGP11213, Throughput = 74.5342 GNumbers/s, Time = 0.00003 s, Size = 2400000 Numbers
Shutting down...

29
bin/x86_64/linux/release/APM_MonteCarloMultiGPU.txt

@@ -1,29 +0,0 @@
./MonteCarloMultiGPU Starting...
Using single CPU thread for multiple GPUs
MonteCarloMultiGPU
==================
Parallelization method = streamed
Problem scaling = weak
Number of GPUs = 1
Total number of options = 8192
Number of paths = 262144
main(): generating input data...
main(): starting 1 host threads...
main(): GPU statistics, streamed
GPU Device #0: NVIDIA H100 PCIe
Options : 8192
Simulation paths: 262144
Total time (ms.): 5.516000
Note: This is elapsed time for all to compute.
Options per sec.: 1485134.210647
main(): comparing Monte Carlo and Black-Scholes results...
Shutting down...
Test Summary...
L1 norm : 4.869781E-04
Average reserve: 14.607882
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
Test passed

10
bin/x86_64/linux/release/APM_NV12toBGRandResize.txt

@@ -1,10 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
TEST#1:
CUDA resize nv12(1920x1080 --> 640x480), batch: 24, average time: 0.024 ms ==> 0.001 ms/frame
CUDA convert nv12(640x480) to bgr(640x480), batch: 24, average time: 0.061 ms ==> 0.003 ms/frame
TEST#2:
CUDA convert nv12(1920x1080) to bgr(1920x1080), batch: 24, average time: 0.405 ms ==> 0.017 ms/frame
CUDA resize bgr(1920x1080 --> 640x480), batch: 24, average time: 0.318 ms ==> 0.013 ms/frame

19
bin/x86_64/linux/release/APM_SobolQRNG.txt

@@ -1,19 +0,0 @@
Sobol Quasi-Random Number Generator Starting...
> number of vectors = 100000
> number of dimensions = 100
GPU Device 0: "Hopper" with compute capability 9.0
Allocating CPU memory...
Allocating GPU memory...
Initializing direction numbers...
Copying direction numbers to device...
Executing QRNG on GPU...
Gsamples/s: 7.51315
Reading results from GPU...
Executing QRNG on CPU...
Gsamples/s: 0.232504
Checking results...
L1-Error: 0
Shutting down...

6
bin/x86_64/linux/release/APM_StreamPriorities.txt

@@ -1,6 +0,0 @@
Starting [./StreamPriorities]...
GPU Device 0: "Hopper" with compute capability 9.0
CUDA stream priority range: LOW: 0 to HIGH: -5
elapsed time of kernels launched to LOW priority stream: 0.661 ms
elapsed time of kernels launched to HI priority stream: 0.523 ms

17
bin/x86_64/linux/release/APM_UnifiedMemoryPerf.txt

@@ -1,17 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Running ........................................................
Overall Time For matrixMultiplyPerf
Printing Average of 20 measurements in (ms)
Size_KB UMhint UMhntAs UMeasy 0Copy MemCopy CpAsync CpHpglk CpPglAs
4 0.210 0.264 0.332 0.014 0.033 0.026 0.037 0.024
16 0.201 0.307 0.489 0.025 0.043 0.035 0.046 0.045
64 0.311 0.381 0.758 0.067 0.084 0.075 0.074 0.063
256 0.545 0.604 1.429 0.323 0.228 0.212 0.197 0.187
1024 1.551 1.444 2.436 1.902 0.831 0.784 0.714 0.728
4096 4.960 4.375 7.863 11.966 3.239 3.179 2.908 2.919
16384 18.911 17.022 29.696 77.375 13.796 13.757 12.874 12.862
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

44
bin/x86_64/linux/release/APM_UnifiedMemoryStreams.txt

@@ -1,44 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Executing tasks on host / device
Task [0], thread [0] executing on device (569)
Task [1], thread [1] executing on device (904)
Task [2], thread [2] executing on device (529)
Task [3], thread [3] executing on device (600)
Task [4], thread [0] executing on device (975)
Task [5], thread [2] executing on device (995)
Task [6], thread [1] executing on device (576)
Task [7], thread [3] executing on device (700)
Task [8], thread [0] executing on device (716)
Task [9], thread [2] executing on device (358)
Task [10], thread [3] executing on device (941)
Task [11], thread [1] executing on device (403)
Task [12], thread [0] executing on host (97)
Task [13], thread [2] executing on device (451)
Task [14], thread [1] executing on device (789)
Task [15], thread [0] executing on device (810)
Task [16], thread [2] executing on device (807)
Task [17], thread [3] executing on device (756)
Task [18], thread [0] executing on device (509)
Task [19], thread [1] executing on device (252)
Task [20], thread [2] executing on device (515)
Task [21], thread [3] executing on device (676)
Task [22], thread [0] executing on device (948)
Task [23], thread [1] executing on device (944)
Task [24], thread [3] executing on device (974)
Task [25], thread [2] executing on device (513)
Task [26], thread [0] executing on device (207)
Task [27], thread [1] executing on device (509)
Task [28], thread [2] executing on device (344)
Task [29], thread [3] executing on device (198)
Task [30], thread [0] executing on device (223)
Task [31], thread [1] executing on device (382)
Task [32], thread [3] executing on device (980)
Task [33], thread [2] executing on device (519)
Task [34], thread [0] executing on host (92)
Task [35], thread [1] executing on device (677)
Task [36], thread [2] executing on device (769)
Task [37], thread [0] executing on device (199)
Task [38], thread [3] executing on device (845)
Task [39], thread [0] executing on device (844)
All Done!

51
bin/x86_64/linux/release/APM_alignedTypes.txt

@@ -1,51 +0,0 @@
[./alignedTypes] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
[NVIDIA H100 PCIe] has 114 MP(s) x 128 (Cores/MP) = 14592 (Cores)
> Compute scaling value = 1.00
> Memory Size = 49999872
Allocating memory...
Generating host input data array...
Uploading input data to GPU memory...
Testing misaligned types...
uint8...
Avg. time: 1.298781 ms / Copy throughput: 35.853619 GB/s.
TEST OK
uint16...
Avg. time: 0.676656 ms / Copy throughput: 68.817823 GB/s.
TEST OK
RGBA8_misaligned...
Avg. time: 0.371437 ms / Copy throughput: 125.367015 GB/s.
TEST OK
LA32_misaligned...
Avg. time: 0.200531 ms / Copy throughput: 232.213238 GB/s.
TEST OK
RGB32_misaligned...
Avg. time: 0.154500 ms / Copy throughput: 301.398134 GB/s.
TEST OK
RGBA32_misaligned...
Avg. time: 0.124531 ms / Copy throughput: 373.930325 GB/s.
TEST OK
Testing aligned types...
RGBA8...
Avg. time: 0.364031 ms / Copy throughput: 127.917614 GB/s.
TEST OK
I32...
Avg. time: 0.363844 ms / Copy throughput: 127.983539 GB/s.
TEST OK
LA32...
Avg. time: 0.200750 ms / Copy throughput: 231.960205 GB/s.
TEST OK
RGB32...
Avg. time: 0.122375 ms / Copy throughput: 380.518985 GB/s.
TEST OK
RGBA32...
Avg. time: 0.122437 ms / Copy throughput: 380.324735 GB/s.
TEST OK
RGBA32_2...
Avg. time: 0.080563 ms / Copy throughput: 578.010964 GB/s.
TEST OK
[alignedTypes] -> Test Results: 0 Failures
Shutting down...
Test passed

7
bin/x86_64/linux/release/APM_asyncAPI.txt

@@ -1,7 +0,0 @@
[./asyncAPI] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
CUDA device [NVIDIA H100 PCIe]
time spent executing by the GPU: 5.34
time spent by CPU in CUDA calls: 0.03
CPU executed 55200 iterations while waiting for GPU to finish

24
bin/x86_64/linux/release/APM_bandwidthTest.txt

@@ -1,24 +0,0 @@
[CUDA Bandwidth Test] - Starting...
Running on...
Device 0: NVIDIA H100 PCIe
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 27.9
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 25.0
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 1421.4
Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

59
bin/x86_64/linux/release/APM_batchCUBLAS.txt

@@ -1,59 +0,0 @@
batchCUBLAS Starting...
GPU Device 0: "Hopper" with compute capability 9.0
==== Running single kernels ====
Testing sgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbf800000, -1) beta= (0x40000000, 2)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00195909 sec GFLOPS=2.14095
@@@@ sgemm test OK
Testing dgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0x0000000000000000, 0) beta= (0x0000000000000000, 0)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00003910 sec GFLOPS=107.269
@@@@ dgemm test OK
==== Running N=10 without streams ====
Testing sgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbf800000, -1) beta= (0x00000000, 0)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00016713 sec GFLOPS=250.958
@@@@ sgemm test OK
Testing dgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbff0000000000000, -1) beta= (0x0000000000000000, 0)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00144100 sec GFLOPS=29.1069
@@@@ dgemm test OK
==== Running N=10 with streams ====
Testing sgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0x40000000, 2) beta= (0x40000000, 2)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00017214 sec GFLOPS=243.659
@@@@ sgemm test OK
Testing dgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbff0000000000000, -1) beta= (0x0000000000000000, 0)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00014997 sec GFLOPS=279.685
@@@@ dgemm test OK
==== Running N=10 batched ====
Testing sgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0x3f800000, 1) beta= (0xbf800000, -1)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00004101 sec GFLOPS=1022.8
@@@@ sgemm test OK
Testing dgemm
#### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbff0000000000000, -1) beta= (0x4000000000000000, 2)
#### args: lda=128 ldb=128 ldc=128
^^^^ elapsed = 0.00004506 sec GFLOPS=930.803
@@@@ dgemm test OK
Test Summary
0 error(s)

21
bin/x86_64/linux/release/APM_batchedLabelMarkersAndLabelCompressionNPP.txt

@@ -1,21 +0,0 @@
NPP Library Version 12.0.0
CUDA Driver Version: 12.0
CUDA Runtime Version: 12.0
Input file load succeeded.
teapot_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, compressed label count is 155332.
Input file load succeeded.
CT_Skull_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, compressed label count is 414.
Input file load succeeded.
PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u succeeded, compressed label count is 3731.
Input file load succeeded.
PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u succeeded, compressed label count is 1224.
Input file load succeeded.
PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u succeeded, compressed label count is 1440.
teapot_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, compressed label count is 155332.
CT_Skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, compressed label count is 414.
PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u succeeded, compressed label count is 3731.
PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u succeeded, compressed label count is 1222.
PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u succeeded, compressed label count is 1447.

11
bin/x86_64/linux/release/APM_bf16TensorCoreGemm.txt

@@ -1,11 +0,0 @@
Initializing...
GPU Device 0: "Hopper" with compute capability 9.0
M: 8192 (16 x 512)
N: 8192 (16 x 512)
K: 8192 (16 x 512)
Preparing data for GPU...
Required shared memory size: 72 Kb
Computing using high performance kernel = 0 - compute_bf16gemm_async_copy
Time: 9.149888 ms
TFLOPS: 120.17

8
bin/x86_64/linux/release/APM_binaryPartitionCG.txt

@@ -1,8 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Launching 228 blocks with 1024 threads...
Array size = 102400 Num of Odds = 50945 Sum of Odds = 1272565 Sum of Evens 1233938
...Done.

22
bin/x86_64/linux/release/APM_binomialOptions.txt

@@ -1,22 +0,0 @@
[./binomialOptions] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Generating input data...
Running GPU binomial tree...
Options count : 1024
Time steps : 2048
binomialOptionsGPU() time: 2.081000 msec
Options per second : 492071.098457
Running CPU binomial tree...
Comparing the results...
GPU binomial vs. Black-Scholes
L1 norm: 2.220214E-04
CPU binomial vs. Black-Scholes
L1 norm: 2.220922E-04
CPU binomial vs. GPU binomial
L1 norm: 7.997008E-07
Shutting down...
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
Test passed

23
bin/x86_64/linux/release/APM_binomialOptions_nvrtc.txt

@@ -1,23 +0,0 @@
[./binomialOptions_nvrtc] - Starting...
Generating input data...
Running GPU binomial tree...
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Options count : 1024
Time steps : 2048
binomialOptionsGPU() time: 3021.375000 msec
Options per second : 338.918539
Running CPU binomial tree...
Comparing the results...
GPU binomial vs. Black-Scholes
L1 norm: 2.216577E-04
CPU binomial vs. Black-Scholes
L1 norm: 9.435265E-05
CPU binomial vs. GPU binomial
L1 norm: 1.513570E-04
Shutting down...
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
Test passed

4
bin/x86_64/linux/release/APM_c++11_cuda.txt

@@ -1,4 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Read 3223503 byte corpus from ../../../../Samples/0_Introduction/c++11_cuda/warandpeace.txt
counted 107310 instances of 'x', 'y', 'z', or 'w' in "../../../../Samples/0_Introduction/c++11_cuda/warandpeace.txt"

6
bin/x86_64/linux/release/APM_cdpAdvancedQuicksort.txt

@@ -1,6 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
GPU device NVIDIA H100 PCIe has compute capabilities (SM 9.0)
Running qsort on 1000000 elements with seed 0, on NVIDIA H100 PCIe
cdpAdvancedQuicksort PASSED
Sorted 1000000 elems in 5.015 ms (199.389 Melems/sec)

2
bin/x86_64/linux/release/APM_cdpBezierTessellation.txt

@@ -1,2 +0,0 @@
Running on GPU 0 (NVIDIA H100 PCIe)
Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... Done!

5
bin/x86_64/linux/release/APM_cdpQuadtree.txt

@@ -1,5 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
GPU device NVIDIA H100 PCIe has compute capabilities (SM 9.0)
Launching CDP kernel to build the quadtree
Results: OK

23
bin/x86_64/linux/release/APM_cdpSimplePrint.txt

@@ -1,23 +0,0 @@
starting Simple Print (CUDA Dynamic Parallelism)
GPU Device 0: "Hopper" with compute capability 9.0
***************************************************************************
The CPU launches 2 blocks of 2 threads each. On the device each thread will
launch 2 blocks of 2 threads each. The GPU we will do that recursively
until it reaches max_depth=2
In total 2+8=10 blocks are launched!!! (8 from the GPU)
***************************************************************************
Launching cdp_kernel() with CUDA Dynamic Parallelism:
BLOCK 1 launched by the host
BLOCK 0 launched by the host
| BLOCK 3 launched by thread 0 of block 1
| BLOCK 2 launched by thread 0 of block 1
| BLOCK 4 launched by thread 0 of block 0
| BLOCK 5 launched by thread 0 of block 0
| BLOCK 7 launched by thread 1 of block 0
| BLOCK 6 launched by thread 1 of block 0
| BLOCK 9 launched by thread 1 of block 1
| BLOCK 8 launched by thread 1 of block 1

6
bin/x86_64/linux/release/APM_cdpSimpleQuicksort.txt

@@ -1,6 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Initializing data:
Running quicksort on 128 elements
Launching kernel on the GPU
Validating results: OK

4
bin/x86_64/linux/release/APM_clock.txt

@@ -1,4 +0,0 @@
CUDA Clock sample
GPU Device 0: "Hopper" with compute capability 9.0
Average clocks/block = 1904.875000

5
bin/x86_64/linux/release/APM_clock_nvrtc.txt

@@ -1,5 +0,0 @@
CUDA Clock sample
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Average clocks/block = 1839.750000

8
bin/x86_64/linux/release/APM_concurrentKernels.txt

@@ -1,8 +0,0 @@
[./concurrentKernels] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
> Detected Compute SM 9.0 hardware with 114 multi-processors
Expected time for serial execution of 8 kernels = 0.080s
Expected time for concurrent execution of 8 kernels = 0.010s
Measured time for sample = 0.010s
Test passed

13
bin/x86_64/linux/release/APM_conjugateGradient.txt

@@ -1,13 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
> GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
iteration = 1, residual = 4.449882e+01
iteration = 2, residual = 3.245218e+00
iteration = 3, residual = 2.690220e-01
iteration = 4, residual = 2.307639e-02
iteration = 5, residual = 1.993140e-03
iteration = 6, residual = 1.846193e-04
iteration = 7, residual = 1.693379e-05
iteration = 8, residual = 1.600115e-06
Test Summary: Error amount = 0.000000

13
bin/x86_64/linux/release/APM_conjugateGradientCudaGraphs.txt

@@ -1,13 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
> GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
iteration = 1, residual = 4.449882e+01
iteration = 2, residual = 3.245218e+00
iteration = 3, residual = 2.690220e-01
iteration = 4, residual = 2.307639e-02
iteration = 5, residual = 1.993140e-03
iteration = 6, residual = 1.846193e-04
iteration = 7, residual = 1.693379e-05
iteration = 8, residual = 1.600115e-06
Test Summary: Error amount = 0.000000

8
bin/x86_64/linux/release/APM_conjugateGradientMultiBlockCG.txt

@@ -1,8 +0,0 @@
Starting [conjugateGradientMultiBlockCG]...
GPU Device 0: "Hopper" with compute capability 9.0
> GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
GPU Final, residual = 1.600115e-06, kernel execution time = 16.014656 ms
Test Summary: Error amount = 0.000000
&&&& conjugateGradientMultiBlockCG PASSED

4
bin/x86_64/linux/release/APM_conjugateGradientMultiDeviceCG.txt

@@ -1,4 +0,0 @@
Starting [conjugateGradientMultiDeviceCG]...
GPU Device 0: "NVIDIA H100 PCIe" with compute capability 9.0
No two or more GPUs with same architecture capable of concurrentManagedAccess found.
Waiving the sample

18
bin/x86_64/linux/release/APM_conjugateGradientPrecond.txt

@@ -1,18 +0,0 @@
conjugateGradientPrecond starting...
GPU Device 0: "Hopper" with compute capability 9.0
GPU selected Device ID = 0
> GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
laplace dimension = 128
Convergence of CG without preconditioning:
iteration = 564, residual = 9.174634e-13
Convergence Test: OK
Convergence of CG using ILU(0) preconditioning:
iteration = 188, residual = 9.084683e-13
Convergence Test: OK
Test Summary:
Counted total of 0 errors
qaerr1 = 0.000005 qaerr2 = 0.000003

16
bin/x86_64/linux/release/APM_conjugateGradientUM.txt

@@ -1,16 +0,0 @@
Starting [conjugateGradientUM]...
GPU Device 0: "Hopper" with compute capability 9.0
> GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
iteration = 1, residual = 4.449882e+01
iteration = 2, residual = 3.245218e+00
iteration = 3, residual = 2.690220e-01
iteration = 4, residual = 2.307639e-02
iteration = 5, residual = 1.993140e-03
iteration = 6, residual = 1.846193e-04
iteration = 7, residual = 1.693379e-05
iteration = 8, residual = 1.600115e-06
Final residual: 1.600115e-06
&&&& conjugateGradientUM PASSED
Test Summary: Error amount = 0.000000, result = SUCCESS

41
bin/x86_64/linux/release/APM_convolutionFFT2D.txt

@@ -1,41 +0,0 @@
[./convolutionFFT2D] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Testing built-in R2C / C2R FFT-based convolution
...allocating memory
...generating random input data
...creating R2C & C2R FFT plans for 2048 x 2048
...uploading to GPU and padding convolution kernel and input data
...transforming convolution kernel
...running GPU FFT convolution: 33613.444604 MPix/s (0.119000 ms)
...reading back GPU convolution results
...running reference CPU convolution
...comparing the results: rel L2 = 9.395370E-08 (max delta = 1.208283E-06)
L2norm Error OK
...shutting down
Testing custom R2C / C2R FFT-based convolution
...allocating memory
...generating random input data
...creating C2C FFT plan for 2048 x 1024
...uploading to GPU and padding convolution kernel and input data
...transforming convolution kernel
...running GPU FFT convolution: 29197.081461 MPix/s (0.137000 ms)
...reading back GPU FFT results
...running reference CPU convolution
...comparing the results: rel L2 = 1.067915E-07 (max delta = 9.817303E-07)
L2norm Error OK
...shutting down
Testing updated custom R2C / C2R FFT-based convolution
...allocating memory
...generating random input data
...creating C2C FFT plan for 2048 x 1024
...uploading to GPU and padding convolution kernel and input data
...transforming convolution kernel
...running GPU FFT convolution: 39603.959017 MPix/s (0.101000 ms)
...reading back GPU FFT results
...running reference CPU convolution
...comparing the results: rel L2 = 1.065127E-07 (max delta = 9.817303E-07)
L2norm Error OK
...shutting down
Test Summary: 0 errors
Test passed

21
bin/x86_64/linux/release/APM_convolutionSeparable.txt

@@ -1,21 +0,0 @@
[./convolutionSeparable] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Image Width x Height = 3072 x 3072
Allocating and initializing host arrays...
Allocating and initializing CUDA arrays...
Running GPU convolution (16 identical iterations)...
convolutionSeparable, Throughput = 74676.0329 MPixels/sec, Time = 0.00013 s, Size = 9437184 Pixels, NumDevsUsed = 1, Workgroup = 0
Reading back GPU results...
Checking the results...
...running convolutionRowCPU()
...running convolutionColumnCPU()
...comparing the results
...Relative L2 norm: 0.000000E+00
Shutting down...
Test passed

17
bin/x86_64/linux/release/APM_convolutionTexture.txt

@@ -1,17 +0,0 @@
[./convolutionTexture] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Initializing data...
Running GPU rows convolution (10 identical iterations)...
Average convolutionRowsGPU() time: 0.117200 msecs; //40261.023178 Mpix/s
Copying convolutionRowGPU() output back to the texture...
cudaMemcpyToArray() time: 0.067000 msecs; //70426.744514 Mpix/s
Running GPU columns convolution (10 iterations)
Average convolutionColumnsGPU() time: 0.116000 msecs; //40677.518412 Mpix/s
Reading back GPU results...
Checking the results...
...running convolutionRowsCPU()
...running convolutionColumnsCPU()
Relative L2 norm: 0.000000E+00
Shutting down...
Test passed

4
bin/x86_64/linux/release/APM_cppIntegration.txt

@@ -1,4 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Hello World.
Hello World.

30
bin/x86_64/linux/release/APM_cppOverload.txt

@@ -1,30 +0,0 @@
C++ Function Overloading starting...
Device Count: 1
GPU Device 0: "Hopper" with compute capability 9.0
Shared Size: 1024
Constant Size: 0
Local Size: 0
Max Threads Per Block: 1024
Number of Registers: 12
PTX Version: 90
Binary Version: 90
simple_kernel(const int *pIn, int *pOut, int a) PASSED
Shared Size: 2048
Constant Size: 0
Local Size: 0
Max Threads Per Block: 1024
Number of Registers: 14
PTX Version: 90
Binary Version: 90
simple_kernel(const int2 *pIn, int *pOut, int a) PASSED
Shared Size: 2048
Constant Size: 0
Local Size: 0
Max Threads Per Block: 1024
Number of Registers: 14
PTX Version: 90
Binary Version: 90
simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

0
bin/x86_64/linux/release/APM_cuHook.txt

15
bin/x86_64/linux/release/APM_cuSolverDn_LinearSolver.txt

@@ -1,15 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
step 1: read matrix market format
Using default input file [../../../../Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/gr_900_900_crg.mtx]
sparse matrix A is 900 x 900 with 7744 nonzeros, base=1
step 2: convert CSR(A) to dense matrix
step 3: set right hand side vector (b) to 1
step 4: prepare data on device
step 5: solve A*x = b
timing: cholesky = 0.000789 sec
step 6: evaluate residual
|b - A*x| = 1.278977E-13
|A| = 1.600000E+01
|x| = 2.357708E+01
|b - A*x|/(|A|*|x|) = 3.390413E-16

58
bin/x86_64/linux/release/APM_cuSolverRf.txt

@@ -1,58 +0,0 @@
step 1.1: preparation
step 1.1: read matrix market format
GPU Device 0: "Hopper" with compute capability 9.0
Using default input file [../../../../Samples/4_CUDA_Libraries/cuSolverRf/lap2D_5pt_n100.mtx]
WARNING: cusolverRf only works for base-0
sparse matrix A is 10000 x 10000 with 49600 nonzeros, base=0
step 1.2: set right hand side vector (b) to 1
step 2: reorder the matrix to reduce zero fill-in
Q = symrcm(A) or Q = symamd(A)
step 3: B = Q*A*Q^T
step 4: solve A*x = b by LU(B) in cusolverSp
step 4.1: create opaque info structure
step 4.2: analyze LU(B) to know structure of Q and R, and upper bound for nnz(L+U)
step 4.3: workspace for LU(B)
step 4.4: compute Ppivot*B = L*U
step 4.5: check if the matrix is singular
step 4.6: solve A*x = b
i.e. solve B*(Qx) = Q*b
step 4.7: evaluate residual r = b - A*x (result on CPU)
(CPU) |b - A*x| = 4.547474E-12
(CPU) |A| = 8.000000E+00
(CPU) |x| = 7.513384E+02
(CPU) |b - A*x|/(|A|*|x|) = 7.565621E-16
step 5: extract P, Q, L and U from P*B*Q^T = L*U
L has implicit unit diagonal
nnzL = 671550, nnzU = 681550
step 6: form P*A*Q^T = L*U
step 6.1: P = Plu*Qreroder
step 6.2: Q = Qlu*Qreorder
step 7: create cusolverRf handle
step 8: set parameters for cusolverRf
step 9: assemble P*A*Q = L*U
step 10: analyze to extract parallelism
step 11: import A to cusolverRf
step 12: refactorization
step 13: solve A*x = b
step 14: evaluate residual r = b - A*x (result on GPU)
(GPU) |b - A*x| = 4.320100E-12
(GPU) |A| = 8.000000E+00
(GPU) |x| = 7.513384E+02
(GPU) |b - A*x|/(|A|*|x|) = 7.187340E-16
===== statistics
nnz(A) = 49600, nnz(L+U) = 1353100, zero fill-in ratio = 27.280242
===== timing profile
reorder A : 0.003304 sec
B = Q*A*Q^T : 0.000761 sec
cusolverSp LU analysis: 0.000188 sec
cusolverSp LU factor : 0.069354 sec
cusolverSp LU solve : 0.001780 sec
cusolverSp LU extract : 0.005654 sec
cusolverRf assemble : 0.002426 sec
cusolverRf reset : 0.000021 sec
cusolverRf refactor : 0.097122 sec
cusolverRf solve : 0.123813 sec

38
bin/x86_64/linux/release/APM_cuSolverSp_LinearSolver.txt

@@ -1,38 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Using default input file [../../../../Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/lap2D_5pt_n100.mtx]
step 1: read matrix market format
sparse matrix A is 10000 x 10000 with 49600 nonzeros, base=1
step 2: reorder the matrix A to minimize zero fill-in
if the user choose a reordering by -P=symrcm, -P=symamd or -P=metis
step 2.1: no reordering is chosen, Q = 0:n-1
step 2.2: B = A(Q,Q)
step 3: b(j) = 1 + j/n
step 4: prepare data on device
step 5: solve A*x = b on CPU
step 6: evaluate residual r = b - A*x (result on CPU)
(CPU) |b - A*x| = 5.393685E-12
(CPU) |A| = 8.000000E+00
(CPU) |x| = 1.136492E+03
(CPU) |b| = 1.999900E+00
(CPU) |b - A*x|/(|A|*|x| + |b|) = 5.931079E-16
step 7: solve A*x = b on GPU
step 8: evaluate residual r = b - A*x (result on GPU)
(GPU) |b - A*x| = 1.970424E-12
(GPU) |A| = 8.000000E+00
(GPU) |x| = 1.136492E+03
(GPU) |b| = 1.999900E+00
(GPU) |b - A*x|/(|A|*|x| + |b|) = 2.166745E-16
timing chol: CPU = 0.097956 sec , GPU = 0.103812 sec
show last 10 elements of solution vector (GPU)
consistent result for different reordering and solver
x[9990] = 3.000016E+01
x[9991] = 2.807343E+01
x[9992] = 2.601354E+01
x[9993] = 2.380285E+01
x[9994] = 2.141866E+01
x[9995] = 1.883070E+01
x[9996] = 1.599668E+01
x[9997] = 1.285365E+01
x[9998] = 9.299423E+00
x[9999] = 5.147265E+00

24
bin/x86_64/linux/release/APM_cuSolverSp_LowlevelCholesky.txt

@@ -1,24 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Using default input file [../../../../Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/lap2D_5pt_n100.mtx]
step 1: read matrix market format
sparse matrix A is 10000 x 10000 with 49600 nonzeros, base=1
step 2: create opaque info structure
step 3: analyze chol(A) to know structure of L
step 4: workspace for chol(A)
step 5: compute A = L*L^T
step 6: check if the matrix is singular
step 7: solve A*x = b
step 8: evaluate residual r = b - A*x (result on CPU)
(CPU) |b - A*x| = 3.637979E-12
(CPU) |A| = 8.000000E+00
(CPU) |x| = 7.513384E+02
(CPU) |b - A*x|/(|A|*|x|) = 6.052497E-16
step 9: create opaque info structure
step 10: analyze chol(A) to know structure of L
step 11: workspace for chol(A)
step 12: compute A = L*L^T
step 13: check if the matrix is singular
step 14: solve A*x = b
(GPU) |b - A*x| = 1.477929E-12
(GPU) |b - A*x|/(|A|*|x|) = 2.458827E-16

25
bin/x86_64/linux/release/APM_cuSolverSp_LowlevelQR.txt

@@ -1,25 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Using default input file [../../../../Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/lap2D_5pt_n32.mtx]
step 1: read matrix market format
sparse matrix A is 1024 x 1024 with 3008 nonzeros, base=1
step 2: create opaque info structure
step 3: analyze qr(A) to know structure of L
step 4: workspace for qr(A)
step 5: compute A = L*L^T
step 6: check if the matrix is singular
step 7: solve A*x = b
step 8: evaluate residual r = b - A*x (result on CPU)
(CPU) |b - A*x| = 5.329071E-15
(CPU) |A| = 6.000000E+00
(CPU) |x| = 5.000000E-01
(CPU) |b - A*x|/(|A|*|x|) = 1.776357E-15
step 9: create opaque info structure
step 10: analyze qr(A) to know structure of L
step 11: workspace for qr(A)
GPU buffer size = 3751424 bytes
step 12: compute A = L*L^T
step 13: check if the matrix is singular
step 14: solve A*x = b
(GPU) |b - A*x| = 4.218847E-15
(GPU) |b - A*x|/(|A|*|x|) = 1.406282E-15

9
bin/x86_64/linux/release/APM_cudaCompressibleMemory.txt

@@ -1,9 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Generic memory compression support is available
Running saxpy on 167772160 bytes of Compressible memory
Running saxpy with 228 blocks x 1024 threads = 0.084 ms 5.960 TB/s
Running saxpy on 167772160 bytes of Non-Compressible memory
Running saxpy with 228 blocks x 1024 threads = 0.345 ms 1.460 TB/s
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

8
bin/x86_64/linux/release/APM_cudaOpenMP.txt

@@ -1,8 +0,0 @@
./cudaOpenMP Starting...
number of host CPUs: 32
number of CUDA devices: 1
0: NVIDIA H100 PCIe
---------------------------
CPU thread 0 (of 1) uses CUDA device 0
---------------------------

11
bin/x86_64/linux/release/APM_cudaTensorCoreGemm.txt

@@ -1,11 +0,0 @@
Initializing...
GPU Device 0: "Hopper" with compute capability 9.0
M: 4096 (16 x 256)
N: 4096 (16 x 256)
K: 4096 (16 x 256)
Preparing data for GPU...
Required shared memory size: 64 Kb
Computing... using high performance kernel compute_gemm
Time: 1.223904 ms
TFLOPS: 112.30

32
bin/x86_64/linux/release/APM_dct8x8.txt

@@ -1,32 +0,0 @@
./dct8x8 Starting...
GPU Device 0: "Hopper" with compute capability 9.0
CUDA sample DCT/IDCT implementation
===================================
Loading test image: teapot512.bmp... [512 x 512]... Success
Running Gold 1 (CPU) version... Success
Running Gold 2 (CPU) version... Success
Running CUDA 1 (GPU) version... Success
Running CUDA 2 (GPU) version... 82435.220134 MPix/s //0.003180 ms
Success
Running CUDA short (GPU) version... Success
Dumping result to teapot512_gold1.bmp... Success
Dumping result to teapot512_gold2.bmp... Success
Dumping result to teapot512_cuda1.bmp... Success
Dumping result to teapot512_cuda2.bmp... Success
Dumping result to teapot512_cuda_short.bmp... Success
Processing time (CUDA 1) : 0.021800 ms
Processing time (CUDA 2) : 0.003180 ms
Processing time (CUDA short): 0.033000 ms
PSNR Original <---> CPU(Gold 1) : 32.527462
PSNR Original <---> CPU(Gold 2) : 32.527309
PSNR Original <---> GPU(CUDA 1) : 32.527184
PSNR Original <---> GPU(CUDA 2) : 32.527054
PSNR Original <---> GPU(CUDA short): 32.501888
PSNR CPU(Gold 1) <---> GPU(CUDA 1) : 62.845787
PSNR CPU(Gold 2) <---> GPU(CUDA 2) : 66.982300
PSNR CPU(Gold 2) <---> GPU(CUDA short): 40.958466
Test Summary...
Test passed

46
bin/x86_64/linux/release/APM_deviceQuery.txt

@@ -1,46 +0,0 @@
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "NVIDIA H100 PCIe"
CUDA Driver Version / Runtime Version 12.0 / 12.0
CUDA Capability Major/Minor version number: 9.0
Total amount of global memory: 81082 MBytes (85021163520 bytes)
(114) Multiprocessors, (128) CUDA Cores/MP: 14592 CUDA Cores
GPU Max Clock rate: 1650 MHz (1.65 GHz)
Memory Clock rate: 1593 Mhz
Memory Bus Width: 5120-bit
L2 Cache Size: 52428800 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total shared memory per multiprocessor: 233472 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 3 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 193 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.0, CUDA Runtime Version = 12.0, NumDevs = 1
Result = PASS

43
bin/x86_64/linux/release/APM_deviceQueryDrv.txt

@@ -1,43 +0,0 @@
./deviceQueryDrv Starting...
CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)
Device 0: "NVIDIA H100 PCIe"
CUDA Driver Version: 12.0
CUDA Capability Major/Minor version number: 9.0
Total amount of global memory: 81082 MBytes (85021163520 bytes)
(114) Multiprocessors, (128) CUDA Cores/MP: 14592 CUDA Cores
GPU Max Clock rate: 1650 MHz (1.65 GHz)
Memory Clock rate: 1593 Mhz
Memory Bus Width: 5120-bit
L2 Cache Size: 52428800 bytes
Max Texture Dimension Sizes 1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Texture alignment: 512 bytes
Maximum memory pitch: 2147483647 bytes
Concurrent copy and kernel execution: Yes with 3 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 193 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Result = PASS

11
bin/x86_64/linux/release/APM_dmmaTensorCoreGemm.txt

@@ -1,11 +0,0 @@
Initializing...
GPU Device 0: "Hopper" with compute capability 9.0
M: 8192 (8 x 1024)
N: 8192 (8 x 1024)
K: 4096 (4 x 1024)
Preparing data for GPU...
Required shared memory size: 68 Kb
Computing using high performance kernel = 0 - compute_dgemm_async_copy
Time: 30.856800 ms
FP64 TFLOPS: 17.82

11
bin/x86_64/linux/release/APM_dwtHaar1D.txt

@@ -1,11 +0,0 @@
./dwtHaar1D Starting...
GPU Device 0: "Hopper" with compute capability 9.0
source file = "../../../../Samples/5_Domain_Specific/dwtHaar1D/data/signal.dat"
reference file = "result.dat"
gold file = "../../../../Samples/5_Domain_Specific/dwtHaar1D/data/regression.gold.dat"
Reading signal from "../../../../Samples/5_Domain_Specific/dwtHaar1D/data/signal.dat"
Writing result to "result.dat"
Reading reference result from "../../../../Samples/5_Domain_Specific/dwtHaar1D/data/regression.gold.dat"
Test success!

16
bin/x86_64/linux/release/APM_dxtc.txt

@@ -1,16 +0,0 @@
./dxtc Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Image Loaded '../../../../Samples/5_Domain_Specific/dxtc/data/teapot512_std.ppm', 512 x 512 pixels
Running DXT Compression on 512 x 512 image...
16384 Blocks, 64 Threads per Block, 1048576 Threads in Grid...
dxtc, Throughput = 442.8108 MPixels/s, Time = 0.00059 s, Size = 262144 Pixels, NumDevsUsed = 1, Workgroup = 64
Checking accuracy...
RMS(reference, result) = 0.000000
Test passed

13
bin/x86_64/linux/release/APM_eigenvalues.txt

@@ -1,13 +0,0 @@
Starting eigenvalues
GPU Device 0: "Hopper" with compute capability 9.0
Matrix size: 2048 x 2048
Precision: 0.000010
Iterations to be timed: 100
Result filename: 'eigenvalues.dat'
Gerschgorin interval: -2.894310 / 2.923303
Average time step 1: 1.032310 ms
Average time step 2, one intervals: 1.228451 ms
Average time step 2, mult intervals: 2.694728 ms
Average time TOTAL: 4.970180 ms
Test Succeeded!

17
bin/x86_64/linux/release/APM_fastWalshTransform.txt

@@ -1,17 +0,0 @@
./fastWalshTransform Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Initializing data...
...allocating CPU memory
...allocating GPU memory
...generating data
Data length: 8388608; kernel length: 128
Running GPU dyadic convolution using Fast Walsh Transform...
GPU time: 0.751000 ms; GOP/s: 385.362158
Reading back GPU results...
Running straightforward CPU dyadic convolution...
Comparing the results...
Shutting down...
L2 norm: 1.021579E-07
Test passed

5
bin/x86_64/linux/release/APM_fp16ScalarProduct.txt

@@ -1,5 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Result native operators : 644622.000000
Result intrinsics : 644622.000000
&&&& fp16ScalarProduct PASSED

11
bin/x86_64/linux/release/APM_globalToShmemAsyncCopy.txt

@@ -1,11 +0,0 @@
[globalToShmemAsyncCopy] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
MatrixA(1280,1280), MatrixB(1280,1280)
Running kernel = 0 - AsyncCopyMultiStageLargeChunk
Computing result using CUDA Kernel...
done
Performance= 5289.33 GFlop/s, Time= 0.793 msec, Size= 4194304000 Ops, WorkgroupSize= 256 threads/block
Checking computed result for correctness: Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

89
bin/x86_64/linux/release/APM_graphMemoryFootprint.txt

@@ -1,89 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Driver version is: 12.0
Running sample.
================================
Running virtual address reuse example.
Sequential allocations & frees within a single graph enable CUDA to reuse virtual addresses.
Check confirms that d_a and d_b share a virtual address.
FOOTPRINT: 67108864 bytes
Cleaning up example by trimming device memory.
FOOTPRINT: 0 bytes
================================
Running physical memory reuse example.
CUDA reuses the same physical memory for allocations from separate graphs when the allocation lifetimes don't overlap.
Creating the graph execs does not reserve any physical memory.
FOOTPRINT: 0 bytes
The first graph launched reserves the memory it needs.
FOOTPRINT: 67108864 bytes
A subsequent launch of the same graph in the same stream reuses the same physical memory. Thus the memory footprint does not grow here.
FOOTPRINT: 67108864 bytes
Subsequent launches of other graphs in the same stream also reuse the physical memory. Thus the memory footprint does not grow here.
01: FOOTPRINT: 67108864 bytes
02: FOOTPRINT: 67108864 bytes
03: FOOTPRINT: 67108864 bytes
04: FOOTPRINT: 67108864 bytes
05: FOOTPRINT: 67108864 bytes
06: FOOTPRINT: 67108864 bytes
07: FOOTPRINT: 67108864 bytes
Check confirms all graphs use a different virtual address.
Cleaning up example by trimming device memory.
FOOTPRINT: 0 bytes
================================
Running simultaneous streams example.
Graphs that can run concurrently need separate physical memory. In this example, each graph launched in a separate stream increases the total memory footprint.
When launching a new graph, CUDA may reuse physical memory from a graph whose execution has already finished -- even if the new graph is being launched in a different stream from the completed graph. Therefore, a kernel node is added to the graphs to increase runtime.
Initial footprint:
FOOTPRINT: 0 bytes
Each graph launch in a seperate stream grows the memory footprint:
01: FOOTPRINT: 67108864 bytes
02: FOOTPRINT: 134217728 bytes
03: FOOTPRINT: 201326592 bytes
04: FOOTPRINT: 268435456 bytes
05: FOOTPRINT: 335544320 bytes
06: FOOTPRINT: 402653184 bytes
07: FOOTPRINT: 402653184 bytes
Cleaning up example by trimming device memory.
FOOTPRINT: 0 bytes
================================
Running unfreed streams example.
CUDA cannot reuse phyiscal memory from graphs which do not free their allocations.
Despite being launched in the same stream, each graph launch grows the memory footprint. Since the allocation is not freed, CUDA keeps the memory valid for use.
00: FOOTPRINT: 67108864 bytes
01: FOOTPRINT: 134217728 bytes
02: FOOTPRINT: 201326592 bytes
03: FOOTPRINT: 268435456 bytes
04: FOOTPRINT: 335544320 bytes
05: FOOTPRINT: 402653184 bytes
06: FOOTPRINT: 469762048 bytes
07: FOOTPRINT: 536870912 bytes
Trimming does not impact the memory footprint since the un-freed allocations are still holding onto the memory.
FOOTPRINT: 536870912 bytes
Freeing the allocations does not shrink the footprint.
FOOTPRINT: 536870912 bytes
Since the allocations are now freed, trimming does reduce the footprint even when the graph execs are not yet destroyed.
FOOTPRINT: 0 bytes
Cleaning up example by trimming device memory.
FOOTPRINT: 0 bytes
================================
Sample complete.

34
bin/x86_64/linux/release/APM_graphMemoryNodes.txt

@@ -1,34 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Driver version is: 12.0
Setting up sample.
Setup complete.
Running negateSquares in a stream.
Validating negateSquares in a stream...
Validation PASSED!
Running negateSquares in a stream-captured graph.
Validating negateSquares in a stream-captured graph...
Validation PASSED!
Running negateSquares in an explicitly constructed graph.
Check verified that d_negSquare and d_input share a virtual address.
Validating negateSquares in an explicitly constructed graph...
Validation PASSED!
Running negateSquares with d_negSquare freed outside the stream.
Check verified that d_negSquare and d_input share a virtual address.
Validating negateSquares with d_negSquare freed outside the stream...
Validation PASSED!
Running negateSquares with d_negSquare freed outside the graph.
Validating negateSquares with d_negSquare freed outside the graph...
Validation PASSED!
Running negateSquares with d_negSquare freed in a different graph.
Validating negateSquares with d_negSquare freed in a different graph...
Validation PASSED!
Cleaning up sample.
Cleanup complete. Exiting sample.
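
The virtual-address checks above come from allocations and frees expressed as graph nodes. A rough sketch of an explicitly constructed graph with an allocation node and a free node, with the kernel-node plumbing abbreviated and error checks omitted (a sketch, not the sample's implementation):

```cuda
// Hedged sketch: allocation and free expressed as graph nodes, so the
// memory's lifetime is owned by the graph rather than by stream calls.
#include <cstring>
#include <cuda_runtime.h>

cudaGraph_t buildNegateSquareGraph(int device, int n) {
    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);

    // Allocation node: CUDA reports the virtual address back through
    // allocParams.dptr; physical pages are assigned at launch time.
    cudaMemAllocNodeParams allocParams;
    std::memset(&allocParams, 0, sizeof(allocParams));
    allocParams.poolProps.allocType = cudaMemAllocationTypePinned;
    allocParams.poolProps.location.type = cudaMemLocationTypeDevice;
    allocParams.poolProps.location.id = device;
    allocParams.bytesize = static_cast<size_t>(n) * sizeof(float);

    cudaGraphNode_t allocNode;
    cudaGraphAddMemAllocNode(&allocNode, graph, nullptr, 0, &allocParams);
    float *dNegSquare = static_cast<float *>(allocParams.dptr);

    // A kernel node that reads/writes dNegSquare would be added here,
    // depending on allocNode (omitted for brevity).

    // Free node: because the graph frees its own allocation, later launches
    // in the same stream can reuse the same physical memory.
    cudaGraphNode_t freeNode;
    cudaGraphAddMemFreeNode(&freeNode, graph, &allocNode, 1, dNegSquare);
    return graph;
}
```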

48
bin/x86_64/linux/release/APM_histogram.txt

@@ -1,48 +0,0 @@
[[histogram]] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
CUDA device [NVIDIA H100 PCIe] has 114 Multi-Processors, Compute 9.0
Initializing data...
...allocating CPU memory.
...generating input data
...allocating GPU memory and copying input data
Starting up 64-bin histogram...
Running 64-bin GPU histogram for 67108864 bytes (16 runs)...
histogram64() time (average) : 0.00007 sec, 916944.3386 MB/sec
histogram64, Throughput = 916944.3386 MB/s, Time = 0.00007 s, Size = 67108864 Bytes, NumDevsUsed = 1, Workgroup = 64
Validating GPU results...
...reading back GPU results
...histogram64CPU()
...comparing the results...
...64-bin histograms match
Shutting down 64-bin histogram...
Initializing 256-bin histogram...
Running 256-bin GPU histogram for 67108864 bytes (16 runs)...
histogram256() time (average) : 0.00018 sec, 379951.1088 MB/sec
histogram256, Throughput = 379951.1088 MB/s, Time = 0.00018 s, Size = 67108864 Bytes, NumDevsUsed = 1, Workgroup = 192
Validating GPU results...
...reading back GPU results
...histogram256CPU()
...comparing the results
...256-bin histograms match
Shutting down 256-bin histogram...
Shutting down...
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
[histogram] - Test Summary
Test passed

11
bin/x86_64/linux/release/APM_immaTensorCoreGemm.txt

@@ -1,11 +0,0 @@
Initializing...
GPU Device 0: "Hopper" with compute capability 9.0
M: 4096 (16 x 256)
N: 4096 (16 x 256)
K: 4096 (16 x 256)
Preparing data for GPU...
Required shared memory size: 64 Kb
Computing... using high performance kernel compute_gemm_imma
Time: 0.629184 ms
TOPS: 218.44

4
bin/x86_64/linux/release/APM_inlinePTX.txt

@@ -1,4 +0,0 @@
CUDA inline PTX assembler sample
GPU Device 0: "Hopper" with compute capability 9.0
Test Successful.

5
bin/x86_64/linux/release/APM_inlinePTX_nvrtc.txt

@@ -1,5 +0,0 @@
CUDA inline PTX assembler sample
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Test Successful.

15
bin/x86_64/linux/release/APM_interval.txt

@@ -1,15 +0,0 @@
[Interval Computing] starting ...
GPU Device 0: "Hopper" with compute capability 9.0
> GPU Device has Compute Capabilities SM 9.0
GPU naive implementation
Searching for roots in [0.01, 4]...
Found 2 intervals that may contain the root(s)
i[0] = [0.999655515093009, 1.00011722206639]
i[1] = [1.00011907576551, 1.00044661086269]
Number of equations solved: 65536
Time per equation: 0.616870105266571 us
Check against Host computation...

9
bin/x86_64/linux/release/APM_jacobiCudaGraphs.txt

@@ -1,9 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
CPU iterations : 2954
CPU error : 4.988e-03
CPU Processing time: 2525.311035 (ms)
GPU iterations : 2954
GPU error : 4.988e-03
GPU Processing time: 57.967999 (ms)
&&&& jacobiCudaGraphs PASSED

0
bin/x86_64/linux/release/APM_libcuhook.so.1.txt

7
bin/x86_64/linux/release/APM_lineOfSight.txt

@@ -1,7 +0,0 @@
[./lineOfSight] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Line of sight
Average time: 0.020620 ms
Test passed

10
bin/x86_64/linux/release/APM_matrixMul.txt

@@ -1,10 +0,0 @@
[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
MatrixA(320,320), MatrixB(640,320)
Computing result using CUDA Kernel...
done
Performance= 4756.03 GFlop/s, Time= 0.028 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

12
bin/x86_64/linux/release/APM_matrixMulCUBLAS.txt

@@ -1,12 +0,0 @@
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Hopper" with compute capability 9.0
GPU Device 0: "NVIDIA H100 PCIe" with compute capability 9.0
MatrixA(640,480), MatrixB(480,320), MatrixC(640,320)
Computing result using CUBLAS...done.
Performance= 10873.05 GFlop/s, Time= 0.018 msec, Size= 196608000 Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

11
bin/x86_64/linux/release/APM_matrixMulDrv.txt

@@ -1,11 +0,0 @@
[ matrixMulDrv (Driver API) ]
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Total amount of global memory: 85021163520 bytes
> findModulePath found file at <./matrixMul_kernel64.fatbin>
> initCUDA loading module: <./matrixMul_kernel64.fatbin>
> 32 block size selected
Processing time: 0.058000 (ms)
Checking computed result for correctness: Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

6
bin/x86_64/linux/release/APM_matrixMulDynlinkJIT.txt

@@ -1,6 +0,0 @@
[ matrixMulDynlinkJIT (CUDA dynamic linking) ]
> Device 0: "NVIDIA H100 PCIe" with Compute 9.0 capability
> Compiling CUDA module
> PTX JIT log:
Test run success!

9
bin/x86_64/linux/release/APM_matrixMul_nvrtc.txt

@@ -1,9 +0,0 @@
[Matrix Multiply Using CUDA] - Starting...
MatrixA(320,320), MatrixB(640,320)
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Computing result using CUDA Kernel...
Checking computed result for correctness: Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

6
bin/x86_64/linux/release/APM_memMapIPCDrv.txt

@@ -1,6 +0,0 @@
> findModulePath found file at <./memMapIpc_kernel64.ptx>
> initCUDA loading module: <./memMapIpc_kernel64.ptx>
> PTX JIT log:
Step 0 done
Process 0: verifying...

17
bin/x86_64/linux/release/APM_mergeSort.txt

@@ -1,17 +0,0 @@
./mergeSort Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Allocating and initializing host arrays...
Allocating and initializing CUDA arrays...
Initializing GPU merge sort...
Running GPU merge sort...
Time: 1.344000 ms
Reading back GPU merge sort results...
Inspecting the results...
...inspecting keys array: OK
...inspecting keys and values array: OK
...stability property: stable!
Shutting down...

11
bin/x86_64/linux/release/APM_newdelete.txt

@@ -1,11 +0,0 @@
newdelete Starting...
GPU Device 0: "Hopper" with compute capability 9.0
> Container = Vector test OK
> Container = Vector, using placement new on SMEM buffer test OK
> Container = Vector, with user defined datatype test OK
Test Summary: 3/3 successfully run

56
bin/x86_64/linux/release/APM_nvJPEG.txt

@@ -1,56 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Using GPU 0 (NVIDIA H100 PCIe, 114 SMs, 2048 th/SM max, CC 9.0, ECC on)
Decoding images in directory: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/, total 8, batchsize 1
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img1.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img2.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img3.jpg
Image is 3 channels.
Channel #0 size: 640 x 426
Channel #1 size: 320 x 213
Channel #2 size: 320 x 213
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img4.jpg
Image is 3 channels.
Channel #0 size: 640 x 426
Channel #1 size: 320 x 213
Channel #2 size: 320 x 213
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img5.jpg
Image is 3 channels.
Channel #0 size: 640 x 480
Channel #1 size: 320 x 240
Channel #2 size: 320 x 240
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img6.jpg
Image is 3 channels.
Channel #0 size: 640 x 480
Channel #1 size: 320 x 240
Channel #2 size: 320 x 240
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img7.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Processing: ../../../../Samples/4_CUDA_Libraries/nvJPEG/images/img8.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Total decoding time: 3.19197
Avg decoding time per image: 0.398996
Avg images per sec: 2.50629
Avg decoding time per batch: 0.398996

62
bin/x86_64/linux/release/APM_nvJPEG_encoder.txt

@@ -1,62 +0,0 @@
GPU Device 0: "Hopper" with compute capability 9.0
Using GPU 0 (NVIDIA H100 PCIe, 114 SMs, 2048 th/SM max, CC 9.0, ECC on)
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img1.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img1.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img2.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img2.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img3.jpg
Image is 3 channels.
Channel #0 size: 640 x 426
Channel #1 size: 320 x 213
Channel #2 size: 320 x 213
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img3.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img4.jpg
Image is 3 channels.
Channel #0 size: 640 x 426
Channel #1 size: 320 x 213
Channel #2 size: 320 x 213
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img4.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img5.jpg
Image is 3 channels.
Channel #0 size: 640 x 480
Channel #1 size: 320 x 240
Channel #2 size: 320 x 240
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img5.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img6.jpg
Image is 3 channels.
Channel #0 size: 640 x 480
Channel #1 size: 320 x 240
Channel #2 size: 320 x 240
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img6.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img7.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img7.jpg
Processing file: ../../../../Samples/4_CUDA_Libraries/nvJPEG_encoder/images/img8.jpg
Image is 3 channels.
Channel #0 size: 480 x 640
Channel #1 size: 240 x 320
Channel #2 size: 240 x 320
YUV 4:2:0 chroma subsampling
Writing JPEG file: encode_output/img8.jpg
Total images processed: 8
Total time spent on encoding: 1.9711
Avg time/image: 0.246388

35
bin/x86_64/linux/release/APM_p2pBandwidthLatencyTest.txt

@@ -1,35 +0,0 @@
[P2P (Peer-to-Peer) GPU Bandwidth Latency Test]
Device: 0, NVIDIA H100 PCIe, pciBusID: c1, pciDeviceID: 0, pciDomainID:0
***NOTE: If a device does not have P2P access to another device, it falls back to a normal memcopy procedure.
So you may see lower bandwidth (GB/s) and unstable latency (us) in those cases.
P2P Connectivity Matrix
D\D 0
0 1
Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)
D\D 0
0 1628.72
Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)
D\D 0
0 1625.75
Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
D\D 0
0 1668.11
Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)
D\D 0
0 1668.39
P2P=Disabled Latency Matrix (us)
GPU 0
0 2.67
CPU 0
0 2.04
P2P=Enabled Latency (P2P Writes) Matrix (us)
GPU 0
0 2.68
CPU 0
0 2.02
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
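
The fallback described in the NOTE above corresponds to the usual peer-access check. A minimal sketch of that check, with the device IDs as placeholders and error handling omitted:

```cuda
// Hedged sketch: enable P2P only where the topology supports it; otherwise
// peer copies still work but are staged through host memory.
#include <cuda_runtime.h>

bool tryEnablePeerAccess(int src, int dst) {
    int canAccess = 0;
    cudaDeviceCanAccessPeer(&canAccess, src, dst);
    if (!canAccess) return false;        // fall back to the normal memcopy path
    cudaSetDevice(src);
    cudaDeviceEnablePeerAccess(dst, 0);  // flags must be 0
    return true;
}
```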

15
bin/x86_64/linux/release/APM_ptxjit.txt

@@ -1,15 +0,0 @@
[PTX Just In Time (JIT) Compilation (no-qatest)] - Starting...
> Using CUDA Device [0]: NVIDIA H100 PCIe
> findModulePath <./ptxjit_kernel64.ptx>
> initCUDA loading module: <./ptxjit_kernel64.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in 0.000000ms. Linker Output:
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'myKernel' for 'sm_90a'
ptxas info : Function properties for myKernel
ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 8 registers
info : 0 bytes gmem
info : Function properties for 'myKernel':
info : used 8 registers, 0 stack, 0 bytes smem, 536 bytes cmem[0], 0 bytes lmem
CUDA kernel launched
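
The linker output above is typical of driver-API JIT linking. A rough sketch of that flow, where the PTX buffer and module name are placeholders rather than the sample's actual inputs (error checks omitted):

```cuda
// Hedged sketch of PTX JIT linking with the CUDA driver API.
#include <cstdio>
#include <cuda.h>

CUmodule jitLinkPtx(const char *ptxSource, size_t ptxLen) {
    CUlinkState state;
    char infoLog[4096];
    CUjit_option opts[] = {CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES};
    void *optVals[] = {infoLog, (void *)(size_t)sizeof(infoLog)};

    cuLinkCreate(2, opts, optVals, &state);
    cuLinkAddData(state, CU_JIT_INPUT_PTX, (void *)ptxSource, ptxLen,
                  "ptxjit_kernel", 0, nullptr, nullptr);

    void *cubin = nullptr;
    size_t cubinSize = 0;
    cuLinkComplete(state, &cubin, &cubinSize);  // cubin is valid until cuLinkDestroy
    std::printf("Linker Output:\n%s\n", infoLog);

    CUmodule module;
    cuModuleLoadData(&module, cubin);
    cuLinkDestroy(state);
    return module;
}
```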

24
bin/x86_64/linux/release/APM_quasirandomGenerator.txt

@@ -1,24 +0,0 @@
./quasirandomGenerator Starting...
Allocating GPU memory...
Allocating CPU memory...
Initializing QRNG tables...
Testing QRNG...
quasirandomGenerator, Throughput = 51.2334 GNumbers/s, Time = 0.00006 s, Size = 3145728 Numbers, NumDevsUsed = 1, Workgroup = 384
Reading GPU results...
Comparing to the CPU results...
L1 norm: 7.275964E-12
Testing inverseCNDgpu()...
quasirandomGenerator-inverse, Throughput = 116.2931 GNumbers/s, Time = 0.00003 s, Size = 3145728 Numbers, NumDevsUsed = 1, Workgroup = 128
Reading GPU results...
Comparing to the CPU results...
L1 norm: 9.439909E-08
Shutting down...

27
bin/x86_64/linux/release/APM_quasirandomGenerator_nvrtc.txt

@@ -1,27 +0,0 @@
./quasirandomGenerator_nvrtc Starting...
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Allocating GPU memory...
Allocating CPU memory...
Initializing QRNG tables...
Testing QRNG...
quasirandomGenerator, Throughput = 45.0355 GNumbers/s, Time = 0.00007 s, Size = 3145728 Numbers, NumDevsUsed = 1, Workgroup = 384
Reading GPU results...
Comparing to the CPU results...
L1 norm: 7.275964E-12
Testing inverseCNDgpu()...
quasirandomGenerator-inverse, Throughput = 94.7508 GNumbers/s, Time = 0.00003 s, Size = 3145728 Numbers, NumDevsUsed = 1, Workgroup = 128
Reading GPU results...
Comparing to the CPU results...
L1 norm: 9.439909E-08
Shutting down...

9
bin/x86_64/linux/release/APM_radixSortThrust.txt

@@ -1,9 +0,0 @@
./radixSortThrust Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Sorting 1048576 32-bit unsigned int keys and values
radixSortThrust, Throughput = 2276.9744 MElements/s, Time = 0.00046 s, Size = 1048576 elements
Test passed

18
bin/x86_64/linux/release/APM_reduction.txt

@@ -1,18 +0,0 @@
./reduction Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Using Device 0: NVIDIA H100 PCIe
Reducing array of type int
16777216 elements
256 threads (max)
64 blocks
Reduction, Throughput = 49.0089 GB/s, Time = 0.00137 s, Size = 16777216 Elements, NumDevsUsed = 1, Workgroup = 256
GPU result = 2139353471
CPU result = 2139353471
Test passed

14
bin/x86_64/linux/release/APM_reductionMultiBlockCG.txt

@@ -1,14 +0,0 @@
reductionMultiBlockCG Starting...
GPU Device 0: "Hopper" with compute capability 9.0
33554432 elements
numThreads: 1024
numBlocks: 228
Launching SinglePass Multi Block Cooperative Groups kernel
Average time: 0.102750 ms
Bandwidth: 1306.254555 GB/s
GPU result = 1.992401599884
CPU result = 1.992401361465

19
bin/x86_64/linux/release/APM_scalarProd.txt

@@ -1,19 +0,0 @@
./scalarProd Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Initializing data...
...allocating CPU memory.
...allocating GPU memory.
...generating input data in CPU mem.
...copying input data to GPU mem.
Data init done.
Executing GPU kernel...
GPU time: 0.042000 msecs.
Reading back GPU result...
Checking GPU results...
..running CPU scalar product calculation
...comparing the results
Shutting down...
L1 error: 2.745062E-08
Test passed

138
bin/x86_64/linux/release/APM_scan.txt

@@ -1,138 +0,0 @@
./scan Starting...
GPU Device 0: "Hopper" with compute capability 9.0
Allocating and initializing host arrays...
Allocating and initializing CUDA arrays...
Initializing CUDA-C scan...
*** Running GPU scan for short arrays (100 identical iterations)...
Running scan for 4 elements (1703936 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 8 elements (851968 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 16 elements (425984 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 32 elements (212992 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 64 elements (106496 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 128 elements (53248 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 256 elements (26624 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 512 elements (13312 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 1024 elements (6656 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
scan, Throughput = 35.1769 MElements/s, Time = 0.00003 s, Size = 1024 Elements, NumDevsUsed = 1, Workgroup = 256
***Running GPU scan for large arrays (100 identical iterations)...
Running scan for 2048 elements (3328 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 4096 elements (1664 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 8192 elements (832 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 16384 elements (416 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 32768 elements (208 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 65536 elements (104 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 131072 elements (52 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
Running scan for 262144 elements (26 arrays)...
Validating the results...
...reading back GPU results
...scanExclusiveHost()
...comparing the results
...Results Match
scan, Throughput = 5146.1328 MElements/s, Time = 0.00005 s, Size = 262144 Elements, NumDevsUsed = 1, Workgroup = 256
Shutting down...

6
bin/x86_64/linux/release/APM_segmentationTreeThrust.txt

@@ -1,6 +0,0 @@
./segmentationTreeThrust Starting...
GPU Device 0: "Hopper" with compute capability 9.0
* Building segmentation tree... done in 24.6388 (ms)
* Dumping levels for each tree...

24
bin/x86_64/linux/release/APM_shfl_scan.txt

@@ -1,24 +0,0 @@
Starting shfl_scan
GPU Device 0: "Hopper" with compute capability 9.0
> Detected Compute SM 9.0 hardware with 114 multi-processors
Starting shfl_scan
GPU Device 0: "Hopper" with compute capability 9.0
> Detected Compute SM 9.0 hardware with 114 multi-processors
Computing Simple Sum test
---------------------------------------------------
Initialize test data [1, 1, 1...]
Scan summation for 65536 elements, 256 partial sums
Partial summing 256 elements with 1 blocks of size 256
Test Sum: 65536
Time (ms): 0.021504
65536 elements scanned in 0.021504 ms -> 3047.619141 MegaElements/s
CPU verify result diff (GPUvsCPU) = 0
CPU sum (naive) took 0.017810 ms
Computing Integral Image Test on size 1920 x 1080 synthetic data
---------------------------------------------------
Method: Fast Time (GPU Timer): 0.008032 ms Diff = 0
Method: Vertical Scan Time (GPU Timer): 0.068576 ms
CheckSum: 2073600, (expect 1920x1080=2073600)
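
The per-block partial sums in this log follow the standard shuffle-scan pattern. A minimal warp-level sketch (not the sample's exact kernel) using __shfl_up_sync:

```cuda
// Hedged sketch: inclusive warp scan with __shfl_up_sync; a full block/grid
// scan would combine the per-warp results via shared memory and a second pass.
#include <cuda_runtime.h>

__device__ int warpInclusiveScan(int value) {
    const unsigned mask = 0xffffffffu;  // all lanes participate
    int lane = threadIdx.x & 31;
    for (int offset = 1; offset < 32; offset <<= 1) {
        int neighbor = __shfl_up_sync(mask, value, offset);
        if (lane >= offset) value += neighbor;  // lanes below 'offset' keep their value
    }
    return value;
}
```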

6
bin/x86_64/linux/release/APM_simpleAWBarrier.txt

@@ -1,6 +0,0 @@
./simpleAWBarrier starting...
GPU Device 0: "Hopper" with compute capability 9.0
Launching normVecByDotProductAWBarrier kernel with numBlocks = 228 blockSize = 576
Result = PASSED
./simpleAWBarrier completed, returned OK

16
bin/x86_64/linux/release/APM_simpleAssert.txt

@@ -1,16 +0,0 @@
simpleAssert starting...
OS_System_Type.release = 5.4.0-131-generic
OS Info: <#147-Ubuntu SMP Fri Oct 14 17:07:22 UTC 2022>
GPU Device 0: "Hopper" with compute capability 9.0
Launch kernel to generate assertion failures
-- Begin assert output
-- End assert output
Device assert failed as expected, CUDA error message is: device-side assert triggered
simpleAssert completed, returned OK

12
bin/x86_64/linux/release/APM_simpleAssert_nvrtc.txt

@@ -1,12 +0,0 @@
simpleAssert_nvrtc starting...
Launch kernel to generate assertion failures
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
-- Begin assert output
-- End assert output
Device assert failed as expected

5
bin/x86_64/linux/release/APM_simpleAtomicIntrinsics.txt

@@ -1,5 +0,0 @@
simpleAtomicIntrinsics starting...
GPU Device 0: "Hopper" with compute capability 9.0
Processing time: 2.438000 (ms)
simpleAtomicIntrinsics completed, returned OK

6
bin/x86_64/linux/release/APM_simpleAtomicIntrinsics_nvrtc.txt

@@ -1,6 +0,0 @@
simpleAtomicIntrinsics_nvrtc starting...
> Using CUDA Device [0]: NVIDIA H100 PCIe
> Using CUDA Device [0]: NVIDIA H100 PCIe
> GPU Device has SM 9.0 compute capability
Processing time: 0.171000 (ms)
simpleAtomicIntrinsics_nvrtc completed, returned OK

Some files were not shown because too many files have changed in this diff.
