@ -39,12 +39,10 @@
#include <cuda_runtime.h>
#include <cuda_runtime.h>
// includes
// includes
#include <cassert>
#include <cuda.h>
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream>
#include <iostream>
#include <memory>
#include <memory>
@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
// Output format selected for the result tables.
enum printMode { USER_READABLE, CSV };

// Kind of host memory used for the host side of a transfer.
enum memoryMode { PINNED, PAGEABLE };

// Human-readable names for the copy direction. The table is indexed directly
// with a memcpyKind value, so the order of the strings must stay in step with
// the order of the memcpyKind enumerators. The trailing NULL terminates the
// table.
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL};

// Human-readable names for the host memory mode, indexed with a memoryMode
// value (PINNED, PAGEABLE). The trailing NULL terminates the table.
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
@ -97,36 +94,62 @@ char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// declaration, forward
// Parses the command line and runs the requested bandwidth tests.
// Returns 0 on success; argument errors are reported with negative codes.
int runTest(const int argc, const char **argv);

// Dispatches to the quick / range / shmoo implementation selected by `mode`.
// `startDevice`..`endDevice` is an inclusive range of CUDA device ordinals.
void testBandwidth(unsigned int start,
                   unsigned int end,
                   unsigned int increment,
                   testMode mode,
                   memcpyKind kind,
                   printMode printmode,
                   memoryMode memMode,
                   int startDevice,
                   int endDevice,
                   bool wc);

// Measures a single transfer size by running a range test whose start and
// end are both `size`.
void testBandwidthQuick(unsigned int size,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);

// Measures every transfer size from `start` to `end` in steps of `increment`
// (1 + (end - start) / increment sizes in total) and prints the results.
void testBandwidthRange(unsigned int start,
                        unsigned int end,
                        unsigned int increment,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);

// Sweeps transfer sizes up to SHMOO_MEMSIZE_MAX, using a larger increment as
// the size passes each SHMOO_LIMIT_* threshold, and prints the results.
void testBandwidthShmoo(memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);

// Per-direction measurements for one transfer of `memSize` bytes.
// `wc` requests write-combined host memory when pinned memory is allocated.
// NOTE(review): the returned float appears to be a bandwidth in GB/s (the
// implementations track a local named bandwidthInGBs) - confirm against the
// function bodies.
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);

// Prints `count` (size, bandwidth) pairs as a human-readable table.
void printResultsReadable(unsigned int *memSizes,
                          double *bandwidths,
                          unsigned int count,
                          memcpyKind kind,
                          memoryMode memMode,
                          int iNumDevs,
                          bool wc);

// Prints `count` (size, bandwidth) pairs in a database-friendly format.
void printResultsCSV(unsigned int *memSizes,
                     double *bandwidths,
                     unsigned int count,
                     memcpyKind kind,
                     memoryMode memMode,
                     int iNumDevs,
                     bool wc);

// Prints command-line usage.
// NOTE(review): inferred from the name; the body is not visible here.
void printHelp(void);
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Program main
// Program main
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
int main(int argc, char **argv)
{
pArgc = &argc;
pArgc = &argc;
pArgv = argv;
pArgv = argv;
@ -144,8 +167,7 @@ int main(int argc, char **argv) {
// finish
// finish
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
printf(
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
"Results may vary when GPU Boost is enabled.\n");
free(flush_buf);
free(flush_buf);
@ -156,7 +178,8 @@ int main(int argc, char **argv) {
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Parse args, run the appropriate tests
// Parse args, run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
int runTest(const int argc, const char **argv) {
int runTest(const int argc, const char **argv)
{
int start = DEFAULT_SIZE;
int start = DEFAULT_SIZE;
int end = DEFAULT_SIZE;
int end = DEFAULT_SIZE;
int startDevice = 0;
int startDevice = 0;
@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) {
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
if (strcmp(memModeStr, "pageable") == 0) {
if (strcmp(memModeStr, "pageable") == 0) {
memMode = PAGEABLE;
memMode = PAGEABLE;
} else if (strcmp(memModeStr, "pinned") == 0) {
}
else if (strcmp(memModeStr, "pinned") == 0) {
memMode = PINNED;
memMode = PINNED;
} else {
}
else {
printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("See --help for more information\n");
printf("See --help for more information\n");
return -1000;
return -1000;
}
}
} else {
}
else {
// default - pinned memory
// default - pinned memory
memMode = PINNED;
memMode = PINNED;
}
}
@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) {
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
cudaGetErrorString(error_id));
exit(EXIT_FAILURE);
exit(EXIT_FAILURE);
}
}
@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) {
}
}
if (strcmp(device, "all") == 0) {
if (strcmp(device, "all") == 0) {
printf(
printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"!!!!!!\n\n");
"!!!!!!\n\n");
startDevice = 0;
startDevice = 0;
endDevice = deviceCount - 1;
endDevice = deviceCount - 1;
} else {
}
else {
startDevice = endDevice = atoi(device);
startDevice = endDevice = atoi(device);
if (startDevice >= deviceCount || startDevice < 0) {
if (startDevice >= deviceCount || startDevice < 0) {
printf(
printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"used !!!!!\n",
"used !!!!!\n",
startDevice, 0);
startDevice,
0);
startDevice = endDevice = 0;
startDevice = endDevice = 0;
}
}
}
}
@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) {
printf("Running on...\n\n");
printf("Running on...\n\n");
for (int currentDevice = startDevice; currentDevice <= endDevice;
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
currentDevice++) {
cudaDeviceProp deviceProp;
cudaDeviceProp deviceProp;
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) {
exit(EXIT_FAILURE);
exit(EXIT_FAILURE);
}
}
} else {
}
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
else {
cudaGetErrorString(error_id));
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
checkCudaErrors(cudaSetDevice(currentDevice));
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
exit(EXIT_FAILURE);
@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) {
if (strcmp(modeStr, "quick") == 0) {
if (strcmp(modeStr, "quick") == 0) {
printf(" Quick Mode\n\n");
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
mode = QUICK_MODE;
} else if (strcmp(modeStr, "shmoo") == 0) {
}
else if (strcmp(modeStr, "shmoo") == 0) {
printf(" Shmoo Mode\n\n");
printf(" Shmoo Mode\n\n");
mode = SHMOO_MODE;
mode = SHMOO_MODE;
} else if (strcmp(modeStr, "range") == 0) {
}
else if (strcmp(modeStr, "range") == 0) {
printf(" Range Mode\n\n");
printf(" Range Mode\n\n");
mode = RANGE_MODE;
mode = RANGE_MODE;
} else {
}
else {
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("See --help for more information\n");
printf("See --help for more information\n");
return -3000;
return -3000;
}
}
} else {
}
else {
// default mode - quick
// default mode - quick
printf(" Quick Mode\n\n");
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
mode = QUICK_MODE;
@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - start must be greater than zero\n");
printf("Illegal argument - start must be greater than zero\n");
return -4000;
return -4000;
}
}
} else {
}
else {
printf("Must specify a starting size in range mode\n");
printf("Must specify a starting size in range mode\n");
printf("See --help for more information\n");
printf("See --help for more information\n");
return -5000;
return -5000;
@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - start is greater than end\n");
printf("Illegal argument - start is greater than end\n");
return -7000;
return -7000;
}
}
} else {
}
else {
printf("Must specify an end size in range mode.\n");
printf("Must specify an end size in range mode.\n");
printf("See --help for more information\n");
printf("See --help for more information\n");
return -8000;
return -8000;
@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - increment must be greater than zero\n");
printf("Illegal argument - increment must be greater than zero\n");
return -9000;
return -9000;
}
}
} else {
}
else {
printf("Must specify an increment in user mode\n");
printf("Must specify an increment in user mode\n");
printf("See --help for more information\n");
printf("See --help for more information\n");
return -10000;
return -10000;
@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) {
}
}
if (htod) {
if (htod) {
testBandwidth((unsigned int)start, (unsigned int)end,
testBandwidth((unsigned int)start,
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
(unsigned int)end,
memMode, startDevice, endDevice, wc);
(unsigned int)increment,
mode,
HOST_TO_DEVICE,
printmode,
memMode,
startDevice,
endDevice,
wc);
}
}
if (dtoh) {
if (dtoh) {
testBandwidth((unsigned int)start, (unsigned int)end,
testBandwidth((unsigned int)start,
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
(unsigned int)end,
memMode, startDevice, endDevice, wc);
(unsigned int)increment,
mode,
DEVICE_TO_HOST,
printmode,
memMode,
startDevice,
endDevice,
wc);
}
}
if (dtod) {
if (dtod) {
testBandwidth((unsigned int)start, (unsigned int)end,
testBandwidth((unsigned int)start,
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
(unsigned int)end,
memMode, startDevice, endDevice, wc);
(unsigned int)increment,
mode,
DEVICE_TO_DEVICE,
printmode,
memMode,
startDevice,
endDevice,
wc);
}
}
// Ensure that we reset all CUDA Devices in question
// Ensure that we reset all CUDA Devices in question
@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) {
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
void testBandwidth(unsigned int start,
testMode mode, memcpyKind kind, printMode printmode,
unsigned int end,
memoryMode memMode, int startDevice, int endDevice,
unsigned int increment,
bool wc) {
testMode mode,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
switch (mode) {
switch (mode) {
case QUICK_MODE:
case QUICK_MODE:
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
endDevice, wc);
break;
break;
case RANGE_MODE:
case RANGE_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode,
testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
startDevice, endDevice, wc);
break;
break;
case SHMOO_MODE:
case SHMOO_MODE:
@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// Runs the quick-mode test: a single transfer size, measured on every device
// in the inclusive range [startDevice, endDevice].
//
//   size        - transfer size to measure
//   kind        - copy direction (device->host, host->device, device->device)
//   printmode   - output format for the results
//   memMode     - pinned or pageable host memory
//   startDevice - first CUDA device ordinal to test
//   endDevice   - last CUDA device ordinal to test (inclusive)
//   wc          - forwarded unchanged to the range test
void testBandwidthQuick(unsigned int size,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    // Quick mode is a range test whose start and end are the same size. The
    // range test runs 1 + (end - start) / increment copies, so this measures
    // exactly one size. DEFAULT_INCREMENT only serves as the divisor here;
    // it is assumed to be non-zero (defined outside this view).
    testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
}
///////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start, unsigned int end,
void testBandwidthRange(unsigned int start,
unsigned int increment, memcpyKind kind,
unsigned int end,
printMode printmode, memoryMode memMode,
unsigned int increment,
int startDevice, int endDevice, bool wc) {
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
// count the number of copies we're going to run
// count the number of copies we're going to run
unsigned int count = 1 + ((end - start) / increment);
unsigned int count = 1 + ((end - start) / increment);
@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end,
}
}
// Use the device asked by the user
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
currentDevice++) {
cudaSetDevice(currentDevice);
cudaSetDevice(currentDevice);
// run each of the copies
// run each of the copies
@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end,
// print results
// print results
if (printmode == CSV) {
if (printmode == CSV) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc);
}
} else {
else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc);
}
}
// clean up
// clean up
@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end,
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
void testBandwidthShmoo(memcpyKind kind,
memoryMode memMode, int startDevice, int endDevice,
printMode printmode,
bool wc) {
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
// count the number of copies to make
// count the number of copies to make
unsigned int count =
unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
+ ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
+ ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
+ ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
+ ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
+ ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
+ ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double));
double *bandwidths = (double *)malloc(count * sizeof(double));
@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
}
}
// Use the device asked by the user
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
currentDevice++) {
cudaSetDevice(currentDevice);
cudaSetDevice(currentDevice);
// Run the shmoo
// Run the shmoo
int iteration = 0;
int iteration = 0;
@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
while (memSize <= SHMOO_MEMSIZE_MAX) {
while (memSize <= SHMOO_MEMSIZE_MAX) {
if (memSize < SHMOO_LIMIT_20KB) {
if (memSize < SHMOO_LIMIT_20KB) {
memSize += SHMOO_INCREMENT_1KB;
memSize += SHMOO_INCREMENT_1KB;
} else if (memSize < SHMOO_LIMIT_50KB) {
}
else if (memSize < SHMOO_LIMIT_50KB) {
memSize += SHMOO_INCREMENT_2KB;
memSize += SHMOO_INCREMENT_2KB;
} else if (memSize < SHMOO_LIMIT_100KB) {
}
else if (memSize < SHMOO_LIMIT_100KB) {
memSize += SHMOO_INCREMENT_10KB;
memSize += SHMOO_INCREMENT_10KB;
} else if (memSize < SHMOO_LIMIT_1MB) {
}
else if (memSize < SHMOO_LIMIT_1MB) {
memSize += SHMOO_INCREMENT_100KB;
memSize += SHMOO_INCREMENT_100KB;
} else if (memSize < SHMOO_LIMIT_16MB) {
}
else if (memSize < SHMOO_LIMIT_16MB) {
memSize += SHMOO_INCREMENT_1MB;
memSize += SHMOO_INCREMENT_1MB;
} else if (memSize < SHMOO_LIMIT_32MB) {
}
else if (memSize < SHMOO_LIMIT_32MB) {
memSize += SHMOO_INCREMENT_2MB;
memSize += SHMOO_INCREMENT_2MB;
} else {
}
else {
memSize += SHMOO_INCREMENT_4MB;
memSize += SHMOO_INCREMENT_4MB;
}
}
@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
switch (kind) {
switch (kind) {
case DEVICE_TO_HOST:
case DEVICE_TO_HOST:
bandwidths[iteration] +=
bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
break;
break;
case HOST_TO_DEVICE:
case HOST_TO_DEVICE:
bandwidths[iteration] +=
bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
break;
break;
case DEVICE_TO_DEVICE:
case DEVICE_TO_DEVICE:
bandwidths[iteration] +=
bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
testDeviceToDeviceTransfer(memSizes[iteration]);
break;
break;
}
}
@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
printf("\n");
printf("\n");
if (CSV == printmode) {
if (CSV == printmode) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc);
}
} else {
else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
(1 + endDevice - startDevice), wc);
}
}
// clean up
// clean up
@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
bool wc) {
{
StopWatchInterface *timer = NULL;
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
float bandwidthInGBs = 0.0f;
@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
if (PINNED == memMode) {
// pinned memory mode - use special function to get OS-pinned memory
// pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
#if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
(wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
#else
#else
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
#endif
} else {
}
else {
// pageable memory mode - use malloc
// pageable memory mode - use malloc
h_idata = (unsigned char *)malloc(memSize);
h_idata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// initialize the device memory
// initialize the device memory
checkCudaErrors(
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// copy data from GPU to Host
// copy data from GPU to Host
if (PINNED == memMode) {
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
if (bDontUseGPUTiming)
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0));
cudaMemcpyDeviceToHost, 0));
}
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize());
@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
elapsedTimeInMs = sdkGetTimerValue(&timer);
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
sdkResetTimer(&timer);
}
}
} else {
}
else {
elapsedTimeInMs = 0;
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
sdkStartTimer(&timer);
checkCudaErrors(
checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
sdkStopTimer(&timer);
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
sdkResetTimer(&timer);
@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_odata));
checkCudaErrors(cudaFreeHost(h_odata));
} else {
}
else {
free(h_idata);
free(h_idata);
free(h_odata);
free(h_odata);
}
}
@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
bool wc) {
{
StopWatchInterface *timer = NULL;
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
float bandwidthInGBs = 0.0f;
@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
#if CUDART_VERSION >= 2020
// pinned memory mode - use special function to get OS-pinned memory
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
(wc) ? cudaHostAllocWriteCombined : 0));
#else
#else
// pinned memory mode - use special function to get OS-pinned memory
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
#endif
} else {
}
else {
// pageable memory mode - use malloc
// pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
// copy host memory to device memory
// copy host memory to device memory
if (PINNED == memMode) {
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
if (bDontUseGPUTiming)
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0));
cudaMemcpyHostToDevice, 0));
}
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize());
@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
elapsedTimeInMs = sdkGetTimerValue(&timer);
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
sdkResetTimer(&timer);
}
}
} else {
}
else {
elapsedTimeInMs = 0;
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
sdkStartTimer(&timer);
checkCudaErrors(
checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
sdkStopTimer(&timer);
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
sdkResetTimer(&timer);
@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_odata));
checkCudaErrors(cudaFreeHost(h_odata));
} else {
}
else {
free(h_odata);
free(h_odata);
}
}
@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) {
float testDeviceToDeviceTransfer(unsigned int memSize)
{
StopWatchInterface *timer = NULL;
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
float bandwidthInGBs = 0.0f;
@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
// initialize memory
// initialize memory
checkCudaErrors(
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// run the memcopy
// run the memcopy
sdkStartTimer(&timer);
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(
checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
}
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventRecord(stop, 0));
@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
/////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////
// print results in an easily read format
// print results in an easily read format
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Prints the measured bandwidths as a human-readable table on stdout.
//
// memSizes   - transfer sizes, printed with %u in the first column
// bandwidths - one bandwidth value per entry of memSizes, printed with %.1f
// count      - number of entries in memSizes / bandwidths
// kind       - selects the heading text via sMemoryCopyKind[kind]
// memMode    - selects the heading text via sMemoryMode[memMode]
// iNumDevs   - device count shown in the heading
// wc         - presumably the write-combined flag; TODO confirm against caller
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
void printResultsReadable(unsigned int *memSizes,
unsigned int count, memcpyKind kind,
double *bandwidths,
memoryMode memMode, int iNumDevs, bool wc) {
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc)
{
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int i;
unsigned int i;
// Print every row except the last one; the last row is printed after the
// loop because it ends with an extra blank line.
// NOTE(review): count is unsigned, so count == 0 makes (count - 1) wrap to
// UINT_MAX and the loop (and the final printf) would index past the arrays.
// Confirm that callers always pass count >= 1.
// Sizes below 10000 get one extra tab, presumably to keep the bandwidth
// column aligned for short numbers.
for (i = 0; i < (count - 1); i++) {
for (i = 0; i < (count - 1); i++) {
printf(" %u\t\t\t%s%.1f\n", memSizes[i],
printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
}
// Last row: i == count - 1 here; same layout as above plus a trailing blank line.
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i],
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// print results in a database format
// print results in a database format
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Prints one comma-separated "bandwidthTest-<config>, ..." line per measurement.
//
// The <config> tag is built from the arguments: "D2D" for device-to-device
// copies (no memory-mode suffix is appended in that branch), otherwise "D2H"
// or "H2D" followed by "-Paged" or "-Pinned". wc is only consulted for PINNED
// transfers.
//
// memSizes   - transfer sizes, printed as "Size = %u bytes"
// bandwidths - one bandwidth value per entry, printed as "Bandwidth = %.1f GB/s"
// count      - number of entries in memSizes / bandwidths
// iNumDevs   - printed as "NumDevsUsed"
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
void printResultsCSV(unsigned int *memSizes,
unsigned int count, memcpyKind kind, memoryMode memMode,
double *bandwidths,
int iNumDevs, bool wc) {
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc)
{
std::string sConfig;
std::string sConfig;
// log config information
// log config information
if (kind == DEVICE_TO_DEVICE) {
if (kind == DEVICE_TO_DEVICE) {
sConfig += "D2D";
sConfig += "D2D";
} else {
}
else {
if (kind == DEVICE_TO_HOST) {
if (kind == DEVICE_TO_HOST) {
sConfig += "D2H";
sConfig += "D2H";
} else if (kind == HOST_TO_DEVICE) {
}
else if (kind == HOST_TO_DEVICE) {
sConfig += "H2D";
sConfig += "H2D";
}
}
if (memMode == PAGEABLE) {
if (memMode == PAGEABLE) {
sConfig += "-Paged";
sConfig += "-Paged";
} else if (memMode == PINNED) {
}
else if (memMode == PINNED) {
sConfig += "-Pinned";
sConfig += "-Pinned";
if (wc) {
if (wc) {
@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths,
// The transfer time is derived from size and bandwidth rather than measured
// here: seconds = bytes / (bandwidth * 1e9), which matches the GB/s label in
// the output if 1 GB is taken as 1e9 bytes.
// NOTE(review): a bandwidths[i] of 0 would make this a division by zero
// (inf/NaN in floating point) - confirm callers never pass a zero bandwidth.
for (i = 0; i < count; i++) {
for (i = 0; i < count; i++) {
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
printf(
printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bytes, NumDevsUsed = %d\n",
"bytes, NumDevsUsed = %d\n",
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
sConfig.c_str(),
bandwidths[i],
dSeconds,
memSizes[i],
iNumDevs);
}
}
}
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Print help screen
// Print help screen
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
void printHelp(void)
{
printf("Usage: bandwidthTest [OPTION]...\n");
printf("Usage: bandwidthTest [OPTION]...\n");
printf(
printf("Test the bandwidth for device to host, host to device, and device to "
"Test the bandwidth for device to host, host to device, and device to "
"device transfers\n");
"device transfers\n");
printf("\n");
printf("\n");
printf(
printf("Example: measure the bandwidth of device to host pinned memory copies "
"Example: measure the bandwidth of device to host pinned memory copies "
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
printf(
printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"--increment=1024 --dtoh\n");
"--increment=1024 --dtoh\n");
printf("\n");
printf("\n");