|
|
|
@ -109,11 +109,11 @@ BandwidthCopy( T *deviceOut, T *deviceIn,
@@ -109,11 +109,11 @@ BandwidthCopy( T *deviceOut, T *deviceIn,
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
cIterations = 10; |
|
|
|
|
cudaEventRecord( evStart ); |
|
|
|
|
cuda(EventRecord( evStart )); |
|
|
|
|
for ( int i = 0; i < cIterations; i++ ) { |
|
|
|
|
GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn+bOffsetSrc, N-bOffsetDst-bOffsetSrc ); |
|
|
|
|
} |
|
|
|
|
cudaEventRecord( evStop ); |
|
|
|
|
cuda(EventRecord( evStop )); |
|
|
|
|
cuda(DeviceSynchronize() ); |
|
|
|
|
// make configurations that cannot launch error-out with 0 bandwidth |
|
|
|
|
cuda(GetLastError() ); |
|
|
|
@ -217,10 +217,12 @@ main( int argc, char *argv[] )
@@ -217,10 +217,12 @@ main( int argc, char *argv[] )
|
|
|
|
|
{ |
|
|
|
|
int device = 0; |
|
|
|
|
int size = 16; |
|
|
|
|
cudaError_t status; |
|
|
|
|
|
|
|
|
|
if ( chCommandLineGet( &device, "device", argc, argv ) ) { |
|
|
|
|
printf( "Using device %d...\n", device ); |
|
|
|
|
} |
|
|
|
|
cudaSetDevice(device); |
|
|
|
|
cuda(SetDevice(device)); |
|
|
|
|
if ( chCommandLineGet( &size, "size", argc, argv ) ) { |
|
|
|
|
printf( "Using %dM operands ...\n", size ); |
|
|
|
|
} |
|
|
|
@ -261,4 +263,6 @@ main( int argc, char *argv[] )
@@ -261,4 +263,6 @@ main( int argc, char *argv[] )
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
return 0; |
|
|
|
|
Error: |
|
|
|
|
return 1; |
|
|
|
|
} |
|
|
|
|