Browse Source

Port reduction sample code to new error handling

divergence
Nicholas Wilt 9 years ago
parent
commit
096e5506f0
  1. 32
      reduction/reduction.cu
  2. 32
      reduction/reductionTemplated.cu

32
reduction/reduction.cu

@@ -81,13 +81,13 @@ TimedReduction(
cudaEvent_t stop = 0;
cudaError_t status;
CUDART_CHECK( cudaMalloc( &deviceAnswer, sizeof(int) ) );
CUDART_CHECK( cudaMalloc( &partialSums, cBlocks*sizeof(int) ) );
CUDART_CHECK( cudaEventCreate( &start ) );
CUDART_CHECK( cudaEventCreate( &stop ) );
CUDART_CHECK( cudaThreadSynchronize() );
cuda(Malloc( &deviceAnswer, sizeof(int) ) );
cuda(Malloc( &partialSums, cBlocks*sizeof(int) ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(ThreadSynchronize() );
CUDART_CHECK( cudaEventRecord( start, 0 ) );
cuda(EventRecord( start, 0 ) );
hostReduction(
deviceAnswer,
partialSums,
@@ -95,8 +95,8 @@ TimedReduction(
N,
cBlocks,
cThreads );
CUDART_CHECK( cudaEventRecord( stop, 0 ) );
CUDART_CHECK( cudaMemcpy(
cuda(EventRecord( stop, 0 ) );
cuda(Memcpy(
answer,
deviceAnswer,
sizeof(int),
@@ -159,13 +159,13 @@ usPerInvocation( int cIterations, size_t N,
double ret = 0.0f;
chTimerTimestamp start, stop;
CUDART_CHECK( cudaMalloc( &smallArray, N*sizeof(int) ) );
CUDART_CHECK( cudaMalloc( &partialSums, 1*sizeof(int) ) );
cuda(Malloc( &smallArray, N*sizeof(int) ) );
cuda(Malloc( &partialSums, 1*sizeof(int) ) );
chTimerGetTime( &start );
for ( int i = 0; i < cIterations; i++ ) {
pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
}
CUDART_CHECK( cudaThreadSynchronize() );
cuda(ThreadSynchronize() );
chTimerGetTime( &stop );
ret = chTimerElapsedTime( &start, &stop );
ret = (ret / (double) cIterations) * 1e6;
@@ -209,10 +209,10 @@ main( int argc, char *argv[] )
hostData = (int *) malloc( cInts*sizeof(int) );
if ( ! hostData )
goto Error;
CUDART_CHECK( cudaSetDevice( device ) );
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
CUDART_CHECK( cudaMalloc( &deviceData, cInts*sizeof(int) ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, 0 ) );
cuda(SetDevice( device ) );
cuda(SetDeviceFlags( cudaDeviceMapHost ) );
cuda(Malloc( &deviceData, cInts*sizeof(int) ) );
cuda(GetDeviceProperties( &props, 0 ) );
sum = 0;
for ( size_t i = 0; i < cInts; i++ ) {
@@ -220,7 +220,7 @@ main( int argc, char *argv[] )
sum += value;
hostData[i] = value;
}
CUDART_CHECK( cudaMemcpy( deviceData, hostData, cInts*sizeof(int),
cuda(Memcpy( deviceData, hostData, cInts*sizeof(int),
cudaMemcpyHostToDevice ) );
{

32
reduction/reductionTemplated.cu

@@ -125,18 +125,18 @@ TimedReduction(
cudaEvent_t stop = 0;
cudaError_t status;
CUDART_CHECK( cudaMalloc( &deviceAnswer, sizeof(ReductionType) ) );
CUDART_CHECK( cudaMalloc( &partialSums, cBlocks*sizeof(ReductionType) ) );
CUDART_CHECK( cudaEventCreate( &start ) );
CUDART_CHECK( cudaEventCreate( &stop ) );
CUDART_CHECK( cudaThreadSynchronize() );
cuda(Malloc( &deviceAnswer, sizeof(ReductionType) ) );
cuda(Malloc( &partialSums, cBlocks*sizeof(ReductionType) ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(ThreadSynchronize() );
CUDART_CHECK( cudaEventRecord( start, 0 ) );
cuda(EventRecord( start, 0 ) );
hostReduction( deviceAnswer, partialSums, deviceIn, N, cBlocks, cThreads );
CUDART_CHECK( cudaEventRecord( stop, 0 ) );
CUDART_CHECK( cudaMemcpy( answer, deviceAnswer, sizeof(T), cudaMemcpyDeviceToHost ) );
cuda(EventRecord( stop, 0 ) );
cuda(Memcpy( answer, deviceAnswer, sizeof(T), cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaEventElapsedTime( &ms, start, stop ) )
cuda(EventElapsedTime( &ms, start, stop ) )
ret = ms * 1000.0f;
// fall through to free resources before returning
@@ -203,13 +203,13 @@ usPerInvocation( int cIterations, size_t N,
double ret = 0.0f;
chTimerTimestamp start, stop;
CUDART_CHECK( cudaMalloc( &smallArray, N*sizeof(T) ) );
CUDART_CHECK( cudaMalloc( &partialSums, 1*sizeof(ReductionType) ) );
cuda(Malloc( &smallArray, N*sizeof(T) ) );
cuda(Malloc( &partialSums, 1*sizeof(ReductionType) ) );
chTimerGetTime( &start );
for ( int i = 0; i < cIterations; i++ ) {
pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
}
CUDART_CHECK( cudaThreadSynchronize() );
cuda(ThreadSynchronize() );
chTimerGetTime( &stop );
ret = chTimerElapsedTime( &start, &stop );
ret = (ret / (double) cIterations) * 1e6;
@@ -246,10 +246,10 @@ ShmooReport( size_t N, bool bFloat )
hostData = (T *) malloc( N*sizeof(T) );
if ( ! hostData )
goto Error;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
CUDART_CHECK( cudaMalloc( &deviceData, N*sizeof(T) ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, 0 ) );
CUDART_CHECK( cudaMemcpy( deviceData, hostData, N*sizeof(T), cudaMemcpyHostToDevice ) );
cuda(SetDeviceFlags( cudaDeviceMapHost ) );
cuda(Malloc( &deviceData, N*sizeof(T) ) );
cuda(GetDeviceProperties( &props, 0 ) );
cuda(Memcpy( deviceData, hostData, N*sizeof(T), cudaMemcpyHostToDevice ) );
forkPrint( g_fileShmoo, "N\tThreads(1)\tus(1)\t"
"Threads(2)\tus(2)\t"

Loading…
Cancel
Save