@ -39,12 +39,10 @@
@@ -39,12 +39,10 @@
#include <cuda_runtime.h>
// includes
#include <cassert>
#include <cuda.h>
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream>
#include <memory>
@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
@@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
"Device to Device", NULL};
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL};
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
@ -97,36 +94,62 @@ char **pArgv = NULL;
@@ -97,36 +94,62 @@ char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
void testBandwidth(unsigned int start,
unsigned int end,
unsigned int increment,
testMode mode,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc);
void testBandwidthRange(unsigned int start, unsigned int end,
unsigned int increment, memcpyKind kind,
printMode printmode, memoryMode memMode,
int startDevice, int endDevice, bool wc);
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
void testBandwidthQuick(unsigned int size,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
void testBandwidthRange(unsigned int start,
unsigned int end,
unsigned int increment,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
void testBandwidthShmoo(memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind,
memoryMode memMode, int iNumDevs, bool wc);
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int iNumDevs, bool wc);
void printResultsReadable(unsigned int *memSizes,
double *bandwidths,
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc);
void printResultsCSV(unsigned int *memSizes,
double *bandwidths,
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc);
void printHelp(void);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
int main(int argc, char **argv)
{
pArgc = &argc;
pArgv = argv;
@ -144,8 +167,7 @@ int main(int argc, char **argv) {
@@ -144,8 +167,7 @@ int main(int argc, char **argv) {
// finish
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
free(flush_buf);
@ -156,7 +178,8 @@ int main(int argc, char **argv) {
@@ -156,7 +178,8 @@ int main(int argc, char **argv) {
///////////////////////////////////////////////////////////////////////////////
// Parse args, run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
int runTest(const int argc, const char **argv) {
int runTest(const int argc, const char **argv)
{
int start = DEFAULT_SIZE;
int end = DEFAULT_SIZE;
int startDevice = 0;
@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) {
@@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) {
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
if (strcmp(memModeStr, "pageable") == 0) {
memMode = PAGEABLE;
} else if (strcmp(memModeStr, "pinned") == 0) {
}
else if (strcmp(memModeStr, "pinned") == 0) {
memMode = PINNED;
} else {
}
else {
printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("See --help for more information\n");
return -1000;
}
} else {
}
else {
// default - pinned memory
memMode = PINNED;
}
@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) {
@@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) {
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
cudaGetErrorString(error_id));
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
exit(EXIT_FAILURE);
}
@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) {
@@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) {
}
if (strcmp(device, "all") == 0) {
printf(
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"!!!!!!\n\n");
startDevice = 0;
endDevice = deviceCount - 1;
} else {
}
else {
startDevice = endDevice = atoi(device);
if (startDevice >= deviceCount || startDevice < 0) {
printf(
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"used !!!!!\n",
startDevice, 0);
startDevice,
0);
startDevice = endDevice = 0;
}
}
@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) {
@@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) {
printf("Running on...\n\n");
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
cudaDeviceProp deviceProp;
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) {
@@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) {
exit(EXIT_FAILURE);
}
} else {
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
cudaGetErrorString(error_id));
}
else {
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) {
@@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) {
if (strcmp(modeStr, "quick") == 0) {
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
} else if (strcmp(modeStr, "shmoo") == 0) {
}
else if (strcmp(modeStr, "shmoo") == 0) {
printf(" Shmoo Mode\n\n");
mode = SHMOO_MODE;
} else if (strcmp(modeStr, "range") == 0) {
}
else if (strcmp(modeStr, "range") == 0) {
printf(" Range Mode\n\n");
mode = RANGE_MODE;
} else {
}
else {
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("See --help for more information\n");
return -3000;
}
} else {
}
else {
// default mode - quick
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) {
@@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - start must be greater than zero\n");
return -4000;
}
} else {
}
else {
printf("Must specify a starting size in range mode\n");
printf("See --help for more information\n");
return -5000;
@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) {
@@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - start is greater than end\n");
return -7000;
}
} else {
}
else {
printf("Must specify an end size in range mode.\n");
printf("See --help for more information\n");
return -8000;
@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) {
@@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) {
printf("Illegal argument - increment must be greater than zero\n");
return -9000;
}
} else {
}
else {
printf("Must specify an increment in user mode\n");
printf("See --help for more information\n");
return -10000;
@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) {
@@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) {
}
if (htod) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
memMode, startDevice, endDevice, wc);
testBandwidth((unsigned int)start,
(unsigned int)end,
(unsigned int)increment,
mode,
HOST_TO_DEVICE,
printmode,
memMode,
startDevice,
endDevice,
wc);
}
if (dtoh) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
memMode, startDevice, endDevice, wc);
testBandwidth((unsigned int)start,
(unsigned int)end,
(unsigned int)increment,
mode,
DEVICE_TO_HOST,
printmode,
memMode,
startDevice,
endDevice,
wc);
}
if (dtod) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
memMode, startDevice, endDevice, wc);
testBandwidth((unsigned int)start,
(unsigned int)end,
(unsigned int)increment,
mode,
DEVICE_TO_DEVICE,
printmode,
memMode,
startDevice,
endDevice,
wc);
}
// Ensure that we reset all CUDA Devices in question
@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) {
@@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) {
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
void testBandwidth(unsigned int start,
unsigned int end,
unsigned int increment,
testMode mode,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
switch (mode) {
case QUICK_MODE:
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
endDevice, wc);
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
break;
case RANGE_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode,
startDevice, endDevice, wc);
testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
break;
case SHMOO_MODE:
@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
@@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode,
startDevice, endDevice, wc);
void testBandwidthQuick(unsigned int size,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
}
///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start, unsigned int end,
unsigned int increment, memcpyKind kind,
printMode printmode, memoryMode memMode,
int startDevice, int endDevice, bool wc) {
void testBandwidthRange(unsigned int start,
unsigned int end,
unsigned int increment,
memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
// count the number of copies we're going to run
unsigned int count = 1 + ((end - start) / increment);
@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end,
@@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end,
}
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
cudaSetDevice(currentDevice);
// run each of the copies
@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end,
@@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end,
// print results
if (printmode == CSV) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
} else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
// clean up
@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end,
@@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end,
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
void testBandwidthShmoo(memcpyKind kind,
printMode printmode,
memoryMode memMode,
int startDevice,
int endDevice,
bool wc)
{
// count the number of copies to make
unsigned int count =
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
+ ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
+ ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
+ ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
+ ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
+ ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
+ ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double));
@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
@@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
}
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
cudaSetDevice(currentDevice);
// Run the shmoo
int iteration = 0;
@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
@@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
while (memSize <= SHMOO_MEMSIZE_MAX) {
if (memSize < SHMOO_LIMIT_20KB) {
memSize += SHMOO_INCREMENT_1KB;
} else if (memSize < SHMOO_LIMIT_50KB) {
}
else if (memSize < SHMOO_LIMIT_50KB) {
memSize += SHMOO_INCREMENT_2KB;
} else if (memSize < SHMOO_LIMIT_100KB) {
}
else if (memSize < SHMOO_LIMIT_100KB) {
memSize += SHMOO_INCREMENT_10KB;
} else if (memSize < SHMOO_LIMIT_1MB) {
}
else if (memSize < SHMOO_LIMIT_1MB) {
memSize += SHMOO_INCREMENT_100KB;
} else if (memSize < SHMOO_LIMIT_16MB) {
}
else if (memSize < SHMOO_LIMIT_16MB) {
memSize += SHMOO_INCREMENT_1MB;
} else if (memSize < SHMOO_LIMIT_32MB) {
}
else if (memSize < SHMOO_LIMIT_32MB) {
memSize += SHMOO_INCREMENT_2MB;
} else {
}
else {
memSize += SHMOO_INCREMENT_4MB;
}
@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
@@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
switch (kind) {
case DEVICE_TO_HOST:
bandwidths[iteration] +=
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[iteration] +=
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[iteration] +=
testDeviceToDeviceTransfer(memSizes[iteration]);
bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
break;
}
@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
@@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
printf("\n");
if (CSV == printmode) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
} else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
// clean up
@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
@@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
bool wc) {
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
@@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
// pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
} else {
}
else {
// pageable memory mode - use malloc
h_idata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
@@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// initialize the device memory
checkCudaErrors(
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// copy data from GPU to Host
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
if (bDontUseGPUTiming)
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
cudaMemcpyDeviceToHost, 0));
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0));
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
@@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
}
} else {
}
else {
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
checkCudaErrors(
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
@@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_odata));
} else {
}
else {
free(h_idata);
free(h_odata);
}
@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
@@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
bool wc) {
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
@@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
} else {
}
else {
// pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
@@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
// copy host memory to device memory
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
if (bDontUseGPUTiming)
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
cudaMemcpyHostToDevice, 0));
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0));
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
@@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
}
} else {
}
else {
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
checkCudaErrors(
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
@@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_odata));
} else {
}
else {
free(h_odata);
}
@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
@@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) {
float testDeviceToDeviceTransfer(unsigned int memSize)
{
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
@@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
// initialize memory
checkCudaErrors(
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// run the memcopy
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
}
checkCudaErrors(cudaEventRecord(stop, 0));
@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
@@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
/////////////////////////////////////////////////////////
// print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind,
memoryMode memMode, int iNumDevs, bool wc) {
void printResultsReadable(unsigned int *memSizes,
double *bandwidths,
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc)
{
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths,
@@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int i;
for (i = 0; i < (count - 1); i++) {
printf(" %u\t\t\t%s%.1f\n", memSizes[i],
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i],
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
///////////////////////////////////////////////////////////////////////////
// print results in a database format
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int iNumDevs, bool wc) {
void printResultsCSV(unsigned int *memSizes,
double *bandwidths,
unsigned int count,
memcpyKind kind,
memoryMode memMode,
int iNumDevs,
bool wc)
{
std::string sConfig;
// log config information
if (kind == DEVICE_TO_DEVICE) {
sConfig += "D2D";
} else {
}
else {
if (kind == DEVICE_TO_HOST) {
sConfig += "D2H";
} else if (kind == HOST_TO_DEVICE) {
}
else if (kind == HOST_TO_DEVICE) {
sConfig += "H2D";
}
if (memMode == PAGEABLE) {
sConfig += "-Paged";
} else if (memMode == PINNED) {
}
else if (memMode == PINNED) {
sConfig += "-Pinned";
if (wc) {
@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths,
@@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths,
for (i = 0; i < count; i++) {
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
printf(
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bytes, NumDevsUsed = %d\n",
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
sConfig.c_str(),
bandwidths[i],
dSeconds,
memSizes[i],
iNumDevs);
}
}
///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
void printHelp(void)
{
printf("Usage: bandwidthTest [OPTION]...\n");
printf(
"Test the bandwidth for device to host, host to device, and device to "
printf("Test the bandwidth for device to host, host to device, and device to "
"device transfers\n");
printf("\n");
printf(
"Example: measure the bandwidth of device to host pinned memory copies "
printf("Example: measure the bandwidth of device to host pinned memory copies "
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
printf(
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"--increment=1024 --dtoh\n");
printf("\n");