|
|
@ -39,12 +39,10 @@ |
|
|
|
#include <cuda_runtime.h> |
|
|
|
#include <cuda_runtime.h> |
|
|
|
|
|
|
|
|
|
|
|
// includes |
|
|
|
// includes |
|
|
|
|
|
|
|
#include <cassert> |
|
|
|
|
|
|
|
#include <cuda.h> |
|
|
|
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization |
|
|
|
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization |
|
|
|
#include <helper_functions.h> // helper for shared functions common to CUDA Samples |
|
|
|
#include <helper_functions.h> // helper for shared functions common to CUDA Samples |
|
|
|
|
|
|
|
|
|
|
|
#include <cuda.h> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cassert> |
|
|
|
|
|
|
|
#include <iostream> |
|
|
|
#include <iostream> |
|
|
|
#include <memory> |
|
|
|
#include <memory> |
|
|
|
|
|
|
|
|
|
|
@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; |
|
|
|
enum printMode { USER_READABLE, CSV }; |
|
|
|
enum printMode { USER_READABLE, CSV }; |
|
|
|
enum memoryMode { PINNED, PAGEABLE }; |
|
|
|
enum memoryMode { PINNED, PAGEABLE }; |
|
|
|
|
|
|
|
|
|
|
|
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", |
|
|
|
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL}; |
|
|
|
"Device to Device", NULL}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; |
|
|
|
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; |
|
|
|
|
|
|
|
|
|
|
@ -97,36 +94,62 @@ char **pArgv = NULL; |
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
|
|
// declaration, forward |
|
|
|
// declaration, forward |
|
|
|
int runTest(const int argc, const char **argv); |
|
|
|
int runTest(const int argc, const char **argv); |
|
|
|
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, |
|
|
|
void testBandwidth(unsigned int start, |
|
|
|
testMode mode, memcpyKind kind, printMode printmode, |
|
|
|
unsigned int end, |
|
|
|
memoryMode memMode, int startDevice, int endDevice, bool wc); |
|
|
|
unsigned int increment, |
|
|
|
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, |
|
|
|
testMode mode, |
|
|
|
memoryMode memMode, int startDevice, int endDevice, |
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
printMode printmode, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
bool wc); |
|
|
|
bool wc); |
|
|
|
void testBandwidthRange(unsigned int start, unsigned int end, |
|
|
|
void testBandwidthQuick(unsigned int size, |
|
|
|
unsigned int increment, memcpyKind kind, |
|
|
|
memcpyKind kind, |
|
|
|
printMode printmode, memoryMode memMode, |
|
|
|
printMode printmode, |
|
|
|
int startDevice, int endDevice, bool wc); |
|
|
|
memoryMode memMode, |
|
|
|
void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
int startDevice, |
|
|
|
memoryMode memMode, int startDevice, int endDevice, |
|
|
|
int endDevice, |
|
|
|
bool wc); |
|
|
|
bool wc); |
|
|
|
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
void testBandwidthRange(unsigned int start, |
|
|
|
|
|
|
|
unsigned int end, |
|
|
|
|
|
|
|
unsigned int increment, |
|
|
|
|
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
printMode printmode, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
bool wc); |
|
|
|
bool wc); |
|
|
|
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
void testBandwidthShmoo(memcpyKind kind, |
|
|
|
|
|
|
|
printMode printmode, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
bool wc); |
|
|
|
bool wc); |
|
|
|
|
|
|
|
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc); |
|
|
|
|
|
|
|
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc); |
|
|
|
float testDeviceToDeviceTransfer(unsigned int memSize); |
|
|
|
float testDeviceToDeviceTransfer(unsigned int memSize); |
|
|
|
void printResultsReadable(unsigned int *memSizes, double *bandwidths, |
|
|
|
void printResultsReadable(unsigned int *memSizes, |
|
|
|
unsigned int count, memcpyKind kind, |
|
|
|
double *bandwidths, |
|
|
|
memoryMode memMode, int iNumDevs, bool wc); |
|
|
|
unsigned int count, |
|
|
|
void printResultsCSV(unsigned int *memSizes, double *bandwidths, |
|
|
|
memcpyKind kind, |
|
|
|
unsigned int count, memcpyKind kind, memoryMode memMode, |
|
|
|
memoryMode memMode, |
|
|
|
int iNumDevs, bool wc); |
|
|
|
int iNumDevs, |
|
|
|
|
|
|
|
bool wc); |
|
|
|
|
|
|
|
void printResultsCSV(unsigned int *memSizes, |
|
|
|
|
|
|
|
double *bandwidths, |
|
|
|
|
|
|
|
unsigned int count, |
|
|
|
|
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int iNumDevs, |
|
|
|
|
|
|
|
bool wc); |
|
|
|
void printHelp(void); |
|
|
|
void printHelp(void); |
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
|
|
// Program main |
|
|
|
// Program main |
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
|
|
int main(int argc, char **argv) { |
|
|
|
int main(int argc, char **argv) |
|
|
|
|
|
|
|
{ |
|
|
|
pArgc = &argc; |
|
|
|
pArgc = &argc; |
|
|
|
pArgv = argv; |
|
|
|
pArgv = argv; |
|
|
|
|
|
|
|
|
|
|
@ -144,8 +167,7 @@ int main(int argc, char **argv) { |
|
|
|
// finish |
|
|
|
// finish |
|
|
|
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); |
|
|
|
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); |
|
|
|
|
|
|
|
|
|
|
|
printf( |
|
|
|
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " |
|
|
|
"\nNOTE: The CUDA Samples are not meant for performance measurements. " |
|
|
|
|
|
|
|
"Results may vary when GPU Boost is enabled.\n"); |
|
|
|
"Results may vary when GPU Boost is enabled.\n"); |
|
|
|
|
|
|
|
|
|
|
|
free(flush_buf); |
|
|
|
free(flush_buf); |
|
|
@ -156,7 +178,8 @@ int main(int argc, char **argv) { |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
// Parse args, run the appropriate tests |
|
|
|
// Parse args, run the appropriate tests |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
int runTest(const int argc, const char **argv) { |
|
|
|
int runTest(const int argc, const char **argv) |
|
|
|
|
|
|
|
{ |
|
|
|
int start = DEFAULT_SIZE; |
|
|
|
int start = DEFAULT_SIZE; |
|
|
|
int end = DEFAULT_SIZE; |
|
|
|
int end = DEFAULT_SIZE; |
|
|
|
int startDevice = 0; |
|
|
|
int startDevice = 0; |
|
|
@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) { |
|
|
|
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { |
|
|
|
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { |
|
|
|
if (strcmp(memModeStr, "pageable") == 0) { |
|
|
|
if (strcmp(memModeStr, "pageable") == 0) { |
|
|
|
memMode = PAGEABLE; |
|
|
|
memMode = PAGEABLE; |
|
|
|
} else if (strcmp(memModeStr, "pinned") == 0) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (strcmp(memModeStr, "pinned") == 0) { |
|
|
|
memMode = PINNED; |
|
|
|
memMode = PINNED; |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
printf("Invalid memory mode - valid modes are pageable or pinned\n"); |
|
|
|
printf("Invalid memory mode - valid modes are pageable or pinned\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
return -1000; |
|
|
|
return -1000; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
// default - pinned memory |
|
|
|
// default - pinned memory |
|
|
|
memMode = PINNED; |
|
|
|
memMode = PINNED; |
|
|
|
} |
|
|
|
} |
|
|
@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) { |
|
|
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount); |
|
|
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount); |
|
|
|
|
|
|
|
|
|
|
|
if (error_id != cudaSuccess) { |
|
|
|
if (error_id != cudaSuccess) { |
|
|
|
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, |
|
|
|
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); |
|
|
|
cudaGetErrorString(error_id)); |
|
|
|
|
|
|
|
exit(EXIT_FAILURE); |
|
|
|
exit(EXIT_FAILURE); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (strcmp(device, "all") == 0) { |
|
|
|
if (strcmp(device, "all") == 0) { |
|
|
|
printf( |
|
|
|
printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices " |
|
|
|
"\n!!!!!Cumulative Bandwidth to be computed from all the devices " |
|
|
|
|
|
|
|
"!!!!!!\n\n"); |
|
|
|
"!!!!!!\n\n"); |
|
|
|
startDevice = 0; |
|
|
|
startDevice = 0; |
|
|
|
endDevice = deviceCount - 1; |
|
|
|
endDevice = deviceCount - 1; |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
startDevice = endDevice = atoi(device); |
|
|
|
startDevice = endDevice = atoi(device); |
|
|
|
|
|
|
|
|
|
|
|
if (startDevice >= deviceCount || startDevice < 0) { |
|
|
|
if (startDevice >= deviceCount || startDevice < 0) { |
|
|
|
printf( |
|
|
|
printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be " |
|
|
|
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be " |
|
|
|
|
|
|
|
"used !!!!!\n", |
|
|
|
"used !!!!!\n", |
|
|
|
startDevice, 0); |
|
|
|
startDevice, |
|
|
|
|
|
|
|
0); |
|
|
|
startDevice = endDevice = 0; |
|
|
|
startDevice = endDevice = 0; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) { |
|
|
|
|
|
|
|
|
|
|
|
printf("Running on...\n\n"); |
|
|
|
printf("Running on...\n\n"); |
|
|
|
|
|
|
|
|
|
|
|
for (int currentDevice = startDevice; currentDevice <= endDevice; |
|
|
|
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { |
|
|
|
currentDevice++) { |
|
|
|
|
|
|
|
cudaDeviceProp deviceProp; |
|
|
|
cudaDeviceProp deviceProp; |
|
|
|
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); |
|
|
|
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); |
|
|
|
|
|
|
|
|
|
|
@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) { |
|
|
|
|
|
|
|
|
|
|
|
exit(EXIT_FAILURE); |
|
|
|
exit(EXIT_FAILURE); |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, |
|
|
|
else { |
|
|
|
cudaGetErrorString(error_id)); |
|
|
|
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); |
|
|
|
checkCudaErrors(cudaSetDevice(currentDevice)); |
|
|
|
checkCudaErrors(cudaSetDevice(currentDevice)); |
|
|
|
|
|
|
|
|
|
|
|
exit(EXIT_FAILURE); |
|
|
|
exit(EXIT_FAILURE); |
|
|
@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) { |
|
|
|
if (strcmp(modeStr, "quick") == 0) { |
|
|
|
if (strcmp(modeStr, "quick") == 0) { |
|
|
|
printf(" Quick Mode\n\n"); |
|
|
|
printf(" Quick Mode\n\n"); |
|
|
|
mode = QUICK_MODE; |
|
|
|
mode = QUICK_MODE; |
|
|
|
} else if (strcmp(modeStr, "shmoo") == 0) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (strcmp(modeStr, "shmoo") == 0) { |
|
|
|
printf(" Shmoo Mode\n\n"); |
|
|
|
printf(" Shmoo Mode\n\n"); |
|
|
|
mode = SHMOO_MODE; |
|
|
|
mode = SHMOO_MODE; |
|
|
|
} else if (strcmp(modeStr, "range") == 0) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (strcmp(modeStr, "range") == 0) { |
|
|
|
printf(" Range Mode\n\n"); |
|
|
|
printf(" Range Mode\n\n"); |
|
|
|
mode = RANGE_MODE; |
|
|
|
mode = RANGE_MODE; |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
printf("Invalid mode - valid modes are quick, range, or shmoo\n"); |
|
|
|
printf("Invalid mode - valid modes are quick, range, or shmoo\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
return -3000; |
|
|
|
return -3000; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
// default mode - quick |
|
|
|
// default mode - quick |
|
|
|
printf(" Quick Mode\n\n"); |
|
|
|
printf(" Quick Mode\n\n"); |
|
|
|
mode = QUICK_MODE; |
|
|
|
mode = QUICK_MODE; |
|
|
@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) { |
|
|
|
printf("Illegal argument - start must be greater than zero\n"); |
|
|
|
printf("Illegal argument - start must be greater than zero\n"); |
|
|
|
return -4000; |
|
|
|
return -4000; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
printf("Must specify a starting size in range mode\n"); |
|
|
|
printf("Must specify a starting size in range mode\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
return -5000; |
|
|
|
return -5000; |
|
|
@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) { |
|
|
|
printf("Illegal argument - start is greater than end\n"); |
|
|
|
printf("Illegal argument - start is greater than end\n"); |
|
|
|
return -7000; |
|
|
|
return -7000; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
printf("Must specify an end size in range mode.\n"); |
|
|
|
printf("Must specify an end size in range mode.\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
return -8000; |
|
|
|
return -8000; |
|
|
@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) { |
|
|
|
printf("Illegal argument - increment must be greater than zero\n"); |
|
|
|
printf("Illegal argument - increment must be greater than zero\n"); |
|
|
|
return -9000; |
|
|
|
return -9000; |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
printf("Must specify an increment in user mode\n"); |
|
|
|
printf("Must specify an increment in user mode\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
printf("See --help for more information\n"); |
|
|
|
return -10000; |
|
|
|
return -10000; |
|
|
@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (htod) { |
|
|
|
if (htod) { |
|
|
|
testBandwidth((unsigned int)start, (unsigned int)end, |
|
|
|
testBandwidth((unsigned int)start, |
|
|
|
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode, |
|
|
|
(unsigned int)end, |
|
|
|
memMode, startDevice, endDevice, wc); |
|
|
|
(unsigned int)increment, |
|
|
|
|
|
|
|
mode, |
|
|
|
|
|
|
|
HOST_TO_DEVICE, |
|
|
|
|
|
|
|
printmode, |
|
|
|
|
|
|
|
memMode, |
|
|
|
|
|
|
|
startDevice, |
|
|
|
|
|
|
|
endDevice, |
|
|
|
|
|
|
|
wc); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (dtoh) { |
|
|
|
if (dtoh) { |
|
|
|
testBandwidth((unsigned int)start, (unsigned int)end, |
|
|
|
testBandwidth((unsigned int)start, |
|
|
|
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode, |
|
|
|
(unsigned int)end, |
|
|
|
memMode, startDevice, endDevice, wc); |
|
|
|
(unsigned int)increment, |
|
|
|
|
|
|
|
mode, |
|
|
|
|
|
|
|
DEVICE_TO_HOST, |
|
|
|
|
|
|
|
printmode, |
|
|
|
|
|
|
|
memMode, |
|
|
|
|
|
|
|
startDevice, |
|
|
|
|
|
|
|
endDevice, |
|
|
|
|
|
|
|
wc); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (dtod) { |
|
|
|
if (dtod) { |
|
|
|
testBandwidth((unsigned int)start, (unsigned int)end, |
|
|
|
testBandwidth((unsigned int)start, |
|
|
|
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, |
|
|
|
(unsigned int)end, |
|
|
|
memMode, startDevice, endDevice, wc); |
|
|
|
(unsigned int)increment, |
|
|
|
|
|
|
|
mode, |
|
|
|
|
|
|
|
DEVICE_TO_DEVICE, |
|
|
|
|
|
|
|
printmode, |
|
|
|
|
|
|
|
memMode, |
|
|
|
|
|
|
|
startDevice, |
|
|
|
|
|
|
|
endDevice, |
|
|
|
|
|
|
|
wc); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Ensure that we reset all CUDA Devices in question |
|
|
|
// Ensure that we reset all CUDA Devices in question |
|
|
@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) { |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
// Run a bandwidth test |
|
|
|
// Run a bandwidth test |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, |
|
|
|
void testBandwidth(unsigned int start, |
|
|
|
testMode mode, memcpyKind kind, printMode printmode, |
|
|
|
unsigned int end, |
|
|
|
memoryMode memMode, int startDevice, int endDevice, |
|
|
|
unsigned int increment, |
|
|
|
bool wc) { |
|
|
|
testMode mode, |
|
|
|
|
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
printMode printmode, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
|
|
|
|
bool wc) |
|
|
|
|
|
|
|
{ |
|
|
|
switch (mode) { |
|
|
|
switch (mode) { |
|
|
|
case QUICK_MODE: |
|
|
|
case QUICK_MODE: |
|
|
|
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, |
|
|
|
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc); |
|
|
|
endDevice, wc); |
|
|
|
|
|
|
|
break; |
|
|
|
break; |
|
|
|
|
|
|
|
|
|
|
|
case RANGE_MODE: |
|
|
|
case RANGE_MODE: |
|
|
|
testBandwidthRange(start, end, increment, kind, printmode, memMode, |
|
|
|
testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc); |
|
|
|
startDevice, endDevice, wc); |
|
|
|
|
|
|
|
break; |
|
|
|
break; |
|
|
|
|
|
|
|
|
|
|
|
case SHMOO_MODE: |
|
|
|
case SHMOO_MODE: |
|
|
@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, |
|
|
|
////////////////////////////////////////////////////////////////////// |
|
|
|
////////////////////////////////////////////////////////////////////// |
|
|
|
// Run a quick mode bandwidth test |
|
|
|
// Run a quick mode bandwidth test |
|
|
|
////////////////////////////////////////////////////////////////////// |
|
|
|
////////////////////////////////////////////////////////////////////// |
|
|
|
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, |
|
|
|
void testBandwidthQuick(unsigned int size, |
|
|
|
memoryMode memMode, int startDevice, int endDevice, |
|
|
|
memcpyKind kind, |
|
|
|
bool wc) { |
|
|
|
printMode printmode, |
|
|
|
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, |
|
|
|
memoryMode memMode, |
|
|
|
startDevice, endDevice, wc); |
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
|
|
|
|
bool wc) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////// |
|
|
|
// Run a range mode bandwidth test |
|
|
|
// Run a range mode bandwidth test |
|
|
|
////////////////////////////////////////////////////////////////////// |
|
|
|
////////////////////////////////////////////////////////////////////// |
|
|
|
void testBandwidthRange(unsigned int start, unsigned int end, |
|
|
|
void testBandwidthRange(unsigned int start, |
|
|
|
unsigned int increment, memcpyKind kind, |
|
|
|
unsigned int end, |
|
|
|
printMode printmode, memoryMode memMode, |
|
|
|
unsigned int increment, |
|
|
|
int startDevice, int endDevice, bool wc) { |
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
printMode printmode, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
|
|
|
|
bool wc) |
|
|
|
|
|
|
|
{ |
|
|
|
// count the number of copies we're going to run |
|
|
|
// count the number of copies we're going to run |
|
|
|
unsigned int count = 1 + ((end - start) / increment); |
|
|
|
unsigned int count = 1 + ((end - start) / increment); |
|
|
|
|
|
|
|
|
|
|
@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Use the device asked by the user |
|
|
|
// Use the device asked by the user |
|
|
|
for (int currentDevice = startDevice; currentDevice <= endDevice; |
|
|
|
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { |
|
|
|
currentDevice++) { |
|
|
|
|
|
|
|
cudaSetDevice(currentDevice); |
|
|
|
cudaSetDevice(currentDevice); |
|
|
|
|
|
|
|
|
|
|
|
// run each of the copies |
|
|
|
// run each of the copies |
|
|
@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end, |
|
|
|
|
|
|
|
|
|
|
|
// print results |
|
|
|
// print results |
|
|
|
if (printmode == CSV) { |
|
|
|
if (printmode == CSV) { |
|
|
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode, |
|
|
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); |
|
|
|
(1 + endDevice - startDevice), wc); |
|
|
|
} |
|
|
|
} else { |
|
|
|
else { |
|
|
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode, |
|
|
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); |
|
|
|
(1 + endDevice - startDevice), wc); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// clean up |
|
|
|
// clean up |
|
|
@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end, |
|
|
|
////////////////////////////////////////////////////////////////////////////// |
|
|
|
////////////////////////////////////////////////////////////////////////////// |
|
|
|
// Intense shmoo mode - covers a large range of values with varying increments |
|
|
|
// Intense shmoo mode - covers a large range of values with varying increments |
|
|
|
////////////////////////////////////////////////////////////////////////////// |
|
|
|
////////////////////////////////////////////////////////////////////////////// |
|
|
|
void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
void testBandwidthShmoo(memcpyKind kind, |
|
|
|
memoryMode memMode, int startDevice, int endDevice, |
|
|
|
printMode printmode, |
|
|
|
bool wc) { |
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int startDevice, |
|
|
|
|
|
|
|
int endDevice, |
|
|
|
|
|
|
|
bool wc) |
|
|
|
|
|
|
|
{ |
|
|
|
// count the number of copies to make |
|
|
|
// count the number of copies to make |
|
|
|
unsigned int count = |
|
|
|
unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) |
|
|
|
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) + |
|
|
|
+ ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) |
|
|
|
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + |
|
|
|
+ ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) |
|
|
|
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + |
|
|
|
+ ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) |
|
|
|
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + |
|
|
|
+ ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) |
|
|
|
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + |
|
|
|
+ ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) |
|
|
|
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + |
|
|
|
+ ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); |
|
|
|
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); |
|
|
|
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); |
|
|
|
double *bandwidths = (double *)malloc(count * sizeof(double)); |
|
|
|
double *bandwidths = (double *)malloc(count * sizeof(double)); |
|
|
@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Use the device asked by the user |
|
|
|
// Use the device asked by the user |
|
|
|
for (int currentDevice = startDevice; currentDevice <= endDevice; |
|
|
|
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { |
|
|
|
currentDevice++) { |
|
|
|
|
|
|
|
cudaSetDevice(currentDevice); |
|
|
|
cudaSetDevice(currentDevice); |
|
|
|
// Run the shmoo |
|
|
|
// Run the shmoo |
|
|
|
int iteration = 0; |
|
|
|
int iteration = 0; |
|
|
@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
while (memSize <= SHMOO_MEMSIZE_MAX) { |
|
|
|
while (memSize <= SHMOO_MEMSIZE_MAX) { |
|
|
|
if (memSize < SHMOO_LIMIT_20KB) { |
|
|
|
if (memSize < SHMOO_LIMIT_20KB) { |
|
|
|
memSize += SHMOO_INCREMENT_1KB; |
|
|
|
memSize += SHMOO_INCREMENT_1KB; |
|
|
|
} else if (memSize < SHMOO_LIMIT_50KB) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (memSize < SHMOO_LIMIT_50KB) { |
|
|
|
memSize += SHMOO_INCREMENT_2KB; |
|
|
|
memSize += SHMOO_INCREMENT_2KB; |
|
|
|
} else if (memSize < SHMOO_LIMIT_100KB) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (memSize < SHMOO_LIMIT_100KB) { |
|
|
|
memSize += SHMOO_INCREMENT_10KB; |
|
|
|
memSize += SHMOO_INCREMENT_10KB; |
|
|
|
} else if (memSize < SHMOO_LIMIT_1MB) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (memSize < SHMOO_LIMIT_1MB) { |
|
|
|
memSize += SHMOO_INCREMENT_100KB; |
|
|
|
memSize += SHMOO_INCREMENT_100KB; |
|
|
|
} else if (memSize < SHMOO_LIMIT_16MB) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (memSize < SHMOO_LIMIT_16MB) { |
|
|
|
memSize += SHMOO_INCREMENT_1MB; |
|
|
|
memSize += SHMOO_INCREMENT_1MB; |
|
|
|
} else if (memSize < SHMOO_LIMIT_32MB) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (memSize < SHMOO_LIMIT_32MB) { |
|
|
|
memSize += SHMOO_INCREMENT_2MB; |
|
|
|
memSize += SHMOO_INCREMENT_2MB; |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
memSize += SHMOO_INCREMENT_4MB; |
|
|
|
memSize += SHMOO_INCREMENT_4MB; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
|
|
|
|
|
|
|
|
switch (kind) { |
|
|
|
switch (kind) { |
|
|
|
case DEVICE_TO_HOST: |
|
|
|
case DEVICE_TO_HOST: |
|
|
|
bandwidths[iteration] += |
|
|
|
bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc); |
|
|
|
testDeviceToHostTransfer(memSizes[iteration], memMode, wc); |
|
|
|
|
|
|
|
break; |
|
|
|
break; |
|
|
|
|
|
|
|
|
|
|
|
case HOST_TO_DEVICE: |
|
|
|
case HOST_TO_DEVICE: |
|
|
|
bandwidths[iteration] += |
|
|
|
bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc); |
|
|
|
testHostToDeviceTransfer(memSizes[iteration], memMode, wc); |
|
|
|
|
|
|
|
break; |
|
|
|
break; |
|
|
|
|
|
|
|
|
|
|
|
case DEVICE_TO_DEVICE: |
|
|
|
case DEVICE_TO_DEVICE: |
|
|
|
bandwidths[iteration] += |
|
|
|
bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]); |
|
|
|
testDeviceToDeviceTransfer(memSizes[iteration]); |
|
|
|
|
|
|
|
break; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
printf("\n"); |
|
|
|
printf("\n"); |
|
|
|
|
|
|
|
|
|
|
|
if (CSV == printmode) { |
|
|
|
if (CSV == printmode) { |
|
|
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode, |
|
|
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); |
|
|
|
(1 + endDevice - startDevice), wc); |
|
|
|
} |
|
|
|
} else { |
|
|
|
else { |
|
|
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode, |
|
|
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); |
|
|
|
(1 + endDevice - startDevice), wc); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// clean up |
|
|
|
// clean up |
|
|
@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode, |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
// test the bandwidth of a device to host memcopy of a specific size |
|
|
|
// test the bandwidth of a device to host memcopy of a specific size |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) |
|
|
|
bool wc) { |
|
|
|
{ |
|
|
|
StopWatchInterface *timer = NULL; |
|
|
|
StopWatchInterface *timer = NULL; |
|
|
|
float elapsedTimeInMs = 0.0f; |
|
|
|
float elapsedTimeInMs = 0.0f; |
|
|
|
float bandwidthInGBs = 0.0f; |
|
|
|
float bandwidthInGBs = 0.0f; |
|
|
@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (PINNED == memMode) { |
|
|
|
// pinned memory mode - use special function to get OS-pinned memory |
|
|
|
// pinned memory mode - use special function to get OS-pinned memory |
|
|
|
#if CUDART_VERSION >= 2020 |
|
|
|
#if CUDART_VERSION >= 2020 |
|
|
|
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, |
|
|
|
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); |
|
|
|
(wc) ? cudaHostAllocWriteCombined : 0)); |
|
|
|
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); |
|
|
|
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, |
|
|
|
|
|
|
|
(wc) ? cudaHostAllocWriteCombined : 0)); |
|
|
|
|
|
|
|
#else |
|
|
|
#else |
|
|
|
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); |
|
|
|
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); |
|
|
|
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); |
|
|
|
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
// pageable memory mode - use malloc |
|
|
|
// pageable memory mode - use malloc |
|
|
|
h_idata = (unsigned char *)malloc(memSize); |
|
|
|
h_idata = (unsigned char *)malloc(memSize); |
|
|
|
h_odata = (unsigned char *)malloc(memSize); |
|
|
|
h_odata = (unsigned char *)malloc(memSize); |
|
|
@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); |
|
|
|
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); |
|
|
|
|
|
|
|
|
|
|
|
// initialize the device memory |
|
|
|
// initialize the device memory |
|
|
|
checkCudaErrors( |
|
|
|
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); |
|
|
|
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// copy data from GPU to Host |
|
|
|
// copy data from GPU to Host |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (bDontUseGPUTiming) sdkStartTimer(&timer); |
|
|
|
if (bDontUseGPUTiming) |
|
|
|
|
|
|
|
sdkStartTimer(&timer); |
|
|
|
checkCudaErrors(cudaEventRecord(start, 0)); |
|
|
|
checkCudaErrors(cudaEventRecord(start, 0)); |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, |
|
|
|
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0)); |
|
|
|
cudaMemcpyDeviceToHost, 0)); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
checkCudaErrors(cudaEventRecord(stop, 0)); |
|
|
|
checkCudaErrors(cudaEventRecord(stop, 0)); |
|
|
|
checkCudaErrors(cudaDeviceSynchronize()); |
|
|
|
checkCudaErrors(cudaDeviceSynchronize()); |
|
|
@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
elapsedTimeInMs = sdkGetTimerValue(&timer); |
|
|
|
elapsedTimeInMs = sdkGetTimerValue(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
elapsedTimeInMs = 0; |
|
|
|
elapsedTimeInMs = 0; |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
sdkStartTimer(&timer); |
|
|
|
sdkStartTimer(&timer); |
|
|
|
checkCudaErrors( |
|
|
|
checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); |
|
|
|
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); |
|
|
|
|
|
|
|
sdkStopTimer(&timer); |
|
|
|
sdkStopTimer(&timer); |
|
|
|
elapsedTimeInMs += sdkGetTimerValue(&timer); |
|
|
|
elapsedTimeInMs += sdkGetTimerValue(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (PINNED == memMode) { |
|
|
|
checkCudaErrors(cudaFreeHost(h_idata)); |
|
|
|
checkCudaErrors(cudaFreeHost(h_idata)); |
|
|
|
checkCudaErrors(cudaFreeHost(h_odata)); |
|
|
|
checkCudaErrors(cudaFreeHost(h_odata)); |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
free(h_idata); |
|
|
|
free(h_idata); |
|
|
|
free(h_odata); |
|
|
|
free(h_odata); |
|
|
|
} |
|
|
|
} |
|
|
@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
//! test the bandwidth of a host to device memcopy of a specific size |
|
|
|
//! test the bandwidth of a host to device memcopy of a specific size |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) |
|
|
|
bool wc) { |
|
|
|
{ |
|
|
|
StopWatchInterface *timer = NULL; |
|
|
|
StopWatchInterface *timer = NULL; |
|
|
|
float elapsedTimeInMs = 0.0f; |
|
|
|
float elapsedTimeInMs = 0.0f; |
|
|
|
float bandwidthInGBs = 0.0f; |
|
|
|
float bandwidthInGBs = 0.0f; |
|
|
@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (PINNED == memMode) { |
|
|
|
#if CUDART_VERSION >= 2020 |
|
|
|
#if CUDART_VERSION >= 2020 |
|
|
|
// pinned memory mode - use special function to get OS-pinned memory |
|
|
|
// pinned memory mode - use special function to get OS-pinned memory |
|
|
|
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, |
|
|
|
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); |
|
|
|
(wc) ? cudaHostAllocWriteCombined : 0)); |
|
|
|
|
|
|
|
#else |
|
|
|
#else |
|
|
|
// pinned memory mode - use special function to get OS-pinned memory |
|
|
|
// pinned memory mode - use special function to get OS-pinned memory |
|
|
|
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); |
|
|
|
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
// pageable memory mode - use malloc |
|
|
|
// pageable memory mode - use malloc |
|
|
|
h_odata = (unsigned char *)malloc(memSize); |
|
|
|
h_odata = (unsigned char *)malloc(memSize); |
|
|
|
|
|
|
|
|
|
|
@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
|
|
|
|
|
|
|
|
// copy host memory to device memory |
|
|
|
// copy host memory to device memory |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (PINNED == memMode) { |
|
|
|
if (bDontUseGPUTiming) sdkStartTimer(&timer); |
|
|
|
if (bDontUseGPUTiming) |
|
|
|
|
|
|
|
sdkStartTimer(&timer); |
|
|
|
checkCudaErrors(cudaEventRecord(start, 0)); |
|
|
|
checkCudaErrors(cudaEventRecord(start, 0)); |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, |
|
|
|
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0)); |
|
|
|
cudaMemcpyHostToDevice, 0)); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
checkCudaErrors(cudaEventRecord(stop, 0)); |
|
|
|
checkCudaErrors(cudaEventRecord(stop, 0)); |
|
|
|
checkCudaErrors(cudaDeviceSynchronize()); |
|
|
|
checkCudaErrors(cudaDeviceSynchronize()); |
|
|
@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
elapsedTimeInMs = sdkGetTimerValue(&timer); |
|
|
|
elapsedTimeInMs = sdkGetTimerValue(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
elapsedTimeInMs = 0; |
|
|
|
elapsedTimeInMs = 0; |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
sdkStartTimer(&timer); |
|
|
|
sdkStartTimer(&timer); |
|
|
|
checkCudaErrors( |
|
|
|
checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); |
|
|
|
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); |
|
|
|
|
|
|
|
sdkStopTimer(&timer); |
|
|
|
sdkStopTimer(&timer); |
|
|
|
elapsedTimeInMs += sdkGetTimerValue(&timer); |
|
|
|
elapsedTimeInMs += sdkGetTimerValue(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
|
sdkResetTimer(&timer); |
|
|
@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
|
|
|
|
|
|
|
|
if (PINNED == memMode) { |
|
|
|
if (PINNED == memMode) { |
|
|
|
checkCudaErrors(cudaFreeHost(h_odata)); |
|
|
|
checkCudaErrors(cudaFreeHost(h_odata)); |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
free(h_odata); |
|
|
|
free(h_odata); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
//! test the bandwidth of a device to device memcopy of a specific size |
|
|
|
//! test the bandwidth of a device to device memcopy of a specific size |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////////// |
|
|
|
float testDeviceToDeviceTransfer(unsigned int memSize) { |
|
|
|
float testDeviceToDeviceTransfer(unsigned int memSize) |
|
|
|
|
|
|
|
{ |
|
|
|
StopWatchInterface *timer = NULL; |
|
|
|
StopWatchInterface *timer = NULL; |
|
|
|
float elapsedTimeInMs = 0.0f; |
|
|
|
float elapsedTimeInMs = 0.0f; |
|
|
|
float bandwidthInGBs = 0.0f; |
|
|
|
float bandwidthInGBs = 0.0f; |
|
|
@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) { |
|
|
|
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); |
|
|
|
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); |
|
|
|
|
|
|
|
|
|
|
|
// initialize memory |
|
|
|
// initialize memory |
|
|
|
checkCudaErrors( |
|
|
|
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); |
|
|
|
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// run the memcopy |
|
|
|
// run the memcopy |
|
|
|
sdkStartTimer(&timer); |
|
|
|
sdkStartTimer(&timer); |
|
|
|
checkCudaErrors(cudaEventRecord(start, 0)); |
|
|
|
checkCudaErrors(cudaEventRecord(start, 0)); |
|
|
|
|
|
|
|
|
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { |
|
|
|
checkCudaErrors( |
|
|
|
checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); |
|
|
|
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
checkCudaErrors(cudaEventRecord(stop, 0)); |
|
|
|
checkCudaErrors(cudaEventRecord(stop, 0)); |
|
|
@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) { |
|
|
|
///////////////////////////////////////////////////////// |
|
|
|
///////////////////////////////////////////////////////// |
|
|
|
// print results in an easily read format |
|
|
|
// print results in an easily read format |
|
|
|
//////////////////////////////////////////////////////// |
|
|
|
//////////////////////////////////////////////////////// |
|
|
|
void printResultsReadable(unsigned int *memSizes, double *bandwidths, |
|
|
|
void printResultsReadable(unsigned int *memSizes, |
|
|
|
unsigned int count, memcpyKind kind, |
|
|
|
double *bandwidths, |
|
|
|
memoryMode memMode, int iNumDevs, bool wc) { |
|
|
|
unsigned int count, |
|
|
|
|
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int iNumDevs, |
|
|
|
|
|
|
|
bool wc) |
|
|
|
|
|
|
|
{ |
|
|
|
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); |
|
|
|
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); |
|
|
|
printf(" %s Memory Transfers\n", sMemoryMode[memMode]); |
|
|
|
printf(" %s Memory Transfers\n", sMemoryMode[memMode]); |
|
|
|
|
|
|
|
|
|
|
@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths, |
|
|
|
unsigned int i; |
|
|
|
unsigned int i; |
|
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < (count - 1); i++) { |
|
|
|
for (i = 0; i < (count - 1); i++) { |
|
|
|
printf(" %u\t\t\t%s%.1f\n", memSizes[i], |
|
|
|
printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); |
|
|
|
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], |
|
|
|
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); |
|
|
|
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
// print results in a database format |
|
|
|
// print results in a database format |
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
void printResultsCSV(unsigned int *memSizes, double *bandwidths, |
|
|
|
void printResultsCSV(unsigned int *memSizes, |
|
|
|
unsigned int count, memcpyKind kind, memoryMode memMode, |
|
|
|
double *bandwidths, |
|
|
|
int iNumDevs, bool wc) { |
|
|
|
unsigned int count, |
|
|
|
|
|
|
|
memcpyKind kind, |
|
|
|
|
|
|
|
memoryMode memMode, |
|
|
|
|
|
|
|
int iNumDevs, |
|
|
|
|
|
|
|
bool wc) |
|
|
|
|
|
|
|
{ |
|
|
|
std::string sConfig; |
|
|
|
std::string sConfig; |
|
|
|
|
|
|
|
|
|
|
|
// log config information |
|
|
|
// log config information |
|
|
|
if (kind == DEVICE_TO_DEVICE) { |
|
|
|
if (kind == DEVICE_TO_DEVICE) { |
|
|
|
sConfig += "D2D"; |
|
|
|
sConfig += "D2D"; |
|
|
|
} else { |
|
|
|
} |
|
|
|
|
|
|
|
else { |
|
|
|
if (kind == DEVICE_TO_HOST) { |
|
|
|
if (kind == DEVICE_TO_HOST) { |
|
|
|
sConfig += "D2H"; |
|
|
|
sConfig += "D2H"; |
|
|
|
} else if (kind == HOST_TO_DEVICE) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (kind == HOST_TO_DEVICE) { |
|
|
|
sConfig += "H2D"; |
|
|
|
sConfig += "H2D"; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (memMode == PAGEABLE) { |
|
|
|
if (memMode == PAGEABLE) { |
|
|
|
sConfig += "-Paged"; |
|
|
|
sConfig += "-Paged"; |
|
|
|
} else if (memMode == PINNED) { |
|
|
|
} |
|
|
|
|
|
|
|
else if (memMode == PINNED) { |
|
|
|
sConfig += "-Pinned"; |
|
|
|
sConfig += "-Pinned"; |
|
|
|
|
|
|
|
|
|
|
|
if (wc) { |
|
|
|
if (wc) { |
|
|
@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths, |
|
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++) { |
|
|
|
for (i = 0; i < count; i++) { |
|
|
|
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); |
|
|
|
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); |
|
|
|
printf( |
|
|
|
printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " |
|
|
|
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " |
|
|
|
|
|
|
|
"bytes, NumDevsUsed = %d\n", |
|
|
|
"bytes, NumDevsUsed = %d\n", |
|
|
|
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs); |
|
|
|
sConfig.c_str(), |
|
|
|
|
|
|
|
bandwidths[i], |
|
|
|
|
|
|
|
dSeconds, |
|
|
|
|
|
|
|
memSizes[i], |
|
|
|
|
|
|
|
iNumDevs); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
// Print help screen |
|
|
|
// Print help screen |
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
/////////////////////////////////////////////////////////////////////////// |
|
|
|
void printHelp(void) { |
|
|
|
void printHelp(void) |
|
|
|
|
|
|
|
{ |
|
|
|
printf("Usage: bandwidthTest [OPTION]...\n"); |
|
|
|
printf("Usage: bandwidthTest [OPTION]...\n"); |
|
|
|
printf( |
|
|
|
printf("Test the bandwidth for device to host, host to device, and device to " |
|
|
|
"Test the bandwidth for device to host, host to device, and device to " |
|
|
|
|
|
|
|
"device transfers\n"); |
|
|
|
"device transfers\n"); |
|
|
|
printf("\n"); |
|
|
|
printf("\n"); |
|
|
|
printf( |
|
|
|
printf("Example: measure the bandwidth of device to host pinned memory copies " |
|
|
|
"Example: measure the bandwidth of device to host pinned memory copies " |
|
|
|
|
|
|
|
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); |
|
|
|
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); |
|
|
|
printf( |
|
|
|
printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " |
|
|
|
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " |
|
|
|
|
|
|
|
"--increment=1024 --dtoh\n"); |
|
|
|
"--increment=1024 --dtoh\n"); |
|
|
|
|
|
|
|
|
|
|
|
printf("\n"); |
|
|
|
printf("\n"); |
|
|
|