
Ported nbody to new error handling

Branch: divergence
Nicholas Wilt committed 9 years ago
Commit fd03ad3694
  1. nbody/nbody.cu (58)
  2. nbody/nbody_GPU_AOS.cuh (12)
  3. nbody/nbody_GPU_AOS_const.cuh (16)
  4. nbody/nbody_GPU_AOS_tiled.cuh (20)
  5. nbody/nbody_GPU_AOS_tiled_const.cuh (22)
  6. nbody/nbody_GPU_Atomic.cuh (18)
  7. nbody/nbody_GPU_SOA_tiled.cuh (32)
  8. nbody/nbody_GPU_Shuffle.cuh (16)
  9. nbody/nbody_GPU_shared.cu (16)
  10. nbody/nbody_multiGPU.cu (22)
  11. nbody/nbody_multiGPU_threaded.cu (8)
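
For readers of this diff: the old CUDART_CHECK() macro wrapped a complete CUDA runtime call, while the new cuda() macro pastes the "cuda" prefix onto the call itself, so CUDART_CHECK( cudaMalloc( ... ) ) becomes cuda(Malloc( ... )). Both styles rely on the enclosing function declaring a local cudaError_t status and ending with an Error: cleanup label, which is why every touched function below keeps those. The definitions sketched here are illustrative assumptions only (the real macros live in the project's error-handling header and may add error reporting), and the helper zeroDeviceBuffer() is a made-up call-site example, not part of the commit.

// Minimal sketch of the two macros, under the assumptions stated above.
#include <cuda_runtime.h>

// Old style: wrap the full runtime call.
#define CUDART_CHECK( fn ) do { \
        status = (fn); \
        if ( cudaSuccess != status ) \
            goto Error; \
    } while ( 0 )

// New style: token-paste the "cuda" prefix onto the call at the call site.
#define cuda( fn ) do { \
        status = (cuda##fn); \
        if ( cudaSuccess != status ) \
            goto Error; \
    } while ( 0 )

// Hypothetical call site showing the new style with the status/Error idiom.
static cudaError_t
zeroDeviceBuffer( void **dptr, size_t bytes )
{
    cudaError_t status;
    cuda(Malloc( dptr, bytes ) );
    cuda(Memset( *dptr, 0, bytes ) );
Error:
    return status;
}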

nbody/nbody.cu (58)

@@ -267,7 +267,7 @@ ComputeGravitation(
     // CPU->GPU copies in case we are measuring GPU performance
     if ( g_bCUDAPresent ) {
-        CUDART_CHECK( cudaMemcpyAsync(
+        cuda(MemcpyAsync(
             g_dptrAOS_PosMass,
             g_hostAOS_PosMass,
             4*g_N*sizeof(float),
@@ -338,7 +338,7 @@ ComputeGravitation(
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
         case GPU_AOS_tiled:
             *ms = ComputeGravitation_GPU_AOS_tiled(
@@ -346,7 +346,7 @@ ComputeGravitation(
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
         case GPU_AOS_tiled_const:
             *ms = ComputeGravitation_GPU_AOS_tiled_const(
@@ -354,46 +354,46 @@ ComputeGravitation(
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
 #if 0
         // commented out - too slow even on SM 3.0
         case GPU_Atomic:
-            CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*sizeof(float) ) );
+            cuda(Memset( g_dptrAOS_Force, 0, 3*sizeof(float) ) );
             *ms = ComputeGravitation_GPU_Atomic(
                 g_dptrAOS_Force,
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
 #endif
         case GPU_Shared:
-            CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
+            cuda(Memset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
             *ms = ComputeGravitation_GPU_Shared(
                 g_dptrAOS_Force,
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
         case GPU_Const:
-            CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
+            cuda(Memset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
             *ms = ComputeNBodyGravitation_GPU_AOS_const(
                 g_dptrAOS_Force,
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
         case GPU_Shuffle:
-            CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
+            cuda(Memset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
             *ms = ComputeGravitation_GPU_Shuffle(
                 g_dptrAOS_Force,
                 g_dptrAOS_PosMass,
                 g_softening*g_softening,
                 g_N );
-            CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
+            cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
             break;
         case multiGPU_SingleCPUThread:
             memset( g_hostAOS_Force, 0, 3*g_N*sizeof(float) );
@@ -516,9 +516,9 @@ initializeGPU( void *_p )
     cudaError_t status;
     gpuInit_struct *p = (gpuInit_struct *) _p;
-    CUDART_CHECK( cudaSetDevice( p->iGPU ) );
-    CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
-    CUDART_CHECK( cudaFree(0) );
+    cuda(SetDevice( p->iGPU ) );
+    cuda(SetDeviceFlags( cudaDeviceMapHost ) );
+    cuda(Free(0) );
 Error:
     p->status = status;
 }
@@ -560,7 +560,7 @@ main( int argc, char *argv[] )
     g_bCUDAPresent = (cudaSuccess == status) && (g_numGPUs > 0);
     if ( g_bCUDAPresent ) {
         cudaDeviceProp prop;
-        CUDART_CHECK( cudaGetDeviceProperties( &prop, 0 ) );
+        cuda(GetDeviceProperties( &prop, 0 ) );
         g_bSM30Present = prop.major >= 3;
     }
     g_bNoCPU = chCommandLineGetBool( "nocpu", argc, argv );
@@ -711,27 +711,27 @@ main( int argc, char *argv[] )
     if ( g_bCUDAPresent ) {
         cudaDeviceProp propForVersion;
-        CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
-        CUDART_CHECK( cudaGetDeviceProperties( &propForVersion, 0 ) );
+        cuda(SetDeviceFlags( cudaDeviceMapHost ) );
+        cuda(GetDeviceProperties( &propForVersion, 0 ) );
         if ( propForVersion.major < 3 ) {
             // Only SM 3.x supports shuffle and fast atomics, so we cannot run
             // some algorithms on this board.
             g_maxAlgorithm = multiGPU_MultiCPUThread;
         }
-        CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_PosMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+        cuda(HostAlloc( (void **) &g_hostAOS_PosMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
         for ( int i = 0; i < 3; i++ ) {
-            CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_Pos[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
-            CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_Force[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+            cuda(HostAlloc( (void **) &g_hostSOA_Pos[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+            cuda(HostAlloc( (void **) &g_hostSOA_Force[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
         }
-        CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_Force, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
-        CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_Force_Golden, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
-        CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_VelInvMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
-        CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_Mass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
-        CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_InvMass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
-        CUDART_CHECK( cudaMalloc( &g_dptrAOS_PosMass, 4*g_N*sizeof(float) ) );
-        CUDART_CHECK( cudaMalloc( (void **) &g_dptrAOS_Force, 3*g_N*sizeof(float) ) );
+        cuda(HostAlloc( (void **) &g_hostAOS_Force, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+        cuda(HostAlloc( (void **) &g_hostAOS_Force_Golden, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+        cuda(HostAlloc( (void **) &g_hostAOS_VelInvMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+        cuda(HostAlloc( (void **) &g_hostSOA_Mass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+        cuda(HostAlloc( (void **) &g_hostSOA_InvMass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+        cuda(Malloc( &g_dptrAOS_PosMass, 4*g_N*sizeof(float) ) );
+        cuda(Malloc( (void **) &g_dptrAOS_Force, 3*g_N*sizeof(float) ) );
         if ( g_bGPUCrossCheck ) {
             printf( "GPU cross check enabled (%d GPUs), disabling CPU\n", g_numGPUs );
@@ -742,7 +742,7 @@ main( int argc, char *argv[] )
                 goto Error;
             }
             for ( int i = 0; i < g_numGPUs; i++ ) {
-                CUDART_CHECK( cudaHostAlloc( (void **) (&g_hostAOS_gpuCrossCheckForce[i]), 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
+                cuda(HostAlloc( (void **) (&g_hostAOS_gpuCrossCheckForce[i]), 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
             }
         }
     }

nbody/nbody_GPU_AOS.cuh (12)

@@ -79,14 +79,14 @@ ComputeGravitation_GPU_AOS(
     cudaError_t status;
     cudaEvent_t evStart = 0, evStop = 0;
     float ms = 0.0;
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
     ComputeNBodyGravitation_GPU_AOS<float> <<<300,256>>>(
         force, posMass, N, softeningSquared );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
     cudaEventDestroy( evStop );
     cudaEventDestroy( evStart );

nbody/nbody_GPU_AOS_const.cuh (16)

@@ -89,18 +89,18 @@ ComputeNBodyGravitation_GPU_AOS_const(
     size_t bodiesLeft = N;
     void *p;
-    CUDART_CHECK( cudaGetSymbolAddress( &p, g_constantBodies ) );
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(GetSymbolAddress( &p, g_constantBodies ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
     for ( size_t i = 0; i < N; i += g_bodiesPerPass ) {
         // bodiesThisPass = max(bodiesLeft, g_bodiesPerPass);
         size_t bodiesThisPass = bodiesLeft;
         if ( bodiesThisPass > g_bodiesPerPass ) {
             bodiesThisPass = g_bodiesPerPass;
         }
-        CUDART_CHECK( cudaMemcpyToSymbolAsync(
+        cuda(MemcpyToSymbolAsync(
             g_constantBodies,
             ((float4 *) posMass)+i,
             bodiesThisPass*sizeof(float4),
@@ -111,9 +111,9 @@ ComputeNBodyGravitation_GPU_AOS_const(
             force, posMass, softeningSquared, bodiesThisPass, N );
         bodiesLeft -= bodiesThisPass;
     }
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
     cudaEventDestroy( evStop );
     cudaEventDestroy( evStart );

nbody/nbody_GPU_AOS_tiled.cuh (20)

@@ -186,9 +186,9 @@ ComputeGravitation_GPU_AOS_tiled(
     cudaError_t status;
     dim3 blocks( N/nTile, N/32, 1 );
-    CUDART_CHECK( cudaMemset( force, 0, 3*N*sizeof(float) ) );
+    cuda(Memset( force, 0, 3*N*sizeof(float) ) );
     ComputeNBodyGravitation_GPU_tiled<nTile><<<blocks,nTile>>>( force, posMass, N, softeningSquared );
-    CUDART_CHECK( cudaDeviceSynchronize() );
+    cuda(DeviceSynchronize() );
 Error:
     return status;
 }
@@ -204,20 +204,20 @@ ComputeGravitation_GPU_AOS_tiled(
     cudaError_t status;
     cudaEvent_t evStart = 0, evStop = 0;
     float ms = 0.0;
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
     CUDART_CHECK( ComputeGravitation_GPU_AOS_tiled<128>(
         force,
         posMass,
         softeningSquared,
         N ) );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
-    CUDART_CHECK( cudaEventDestroy( evStop ) );
-    CUDART_CHECK( cudaEventDestroy( evStart ) );
+    cuda(EventDestroy( evStop ) );
+    cuda(EventDestroy( evStart ) );
     return ms;
 }
 #else

nbody/nbody_GPU_AOS_tiled_const.cuh (22)

@@ -303,9 +303,9 @@ ComputeGravitation_GPU_AOS_tiled_const(
     cudaError_t status;
     dim3 blocks( N/nTile, N/32, 1 );
-    CUDART_CHECK( cudaMemset( force, 0, 3*N*sizeof(float) ) );
+    cuda(Memset( force, 0, 3*N*sizeof(float) ) );
     ComputeNBodyGravitation_GPU_tiled_const<nTile><<<blocks,nTile>>>( force, posMass, N, softeningSquared );
-    CUDART_CHECK( cudaDeviceSynchronize() );
+    cuda(DeviceSynchronize() );
 Error:
     return status;
 }
@@ -321,20 +321,20 @@ ComputeGravitation_GPU_AOS_tiled_const(
     cudaError_t status;
     cudaEvent_t evStart = 0, evStop = 0;
     float ms = 0.0;
-    CUDART_CHECK( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(DeviceSetCacheConfig( cudaFuncCachePreferShared ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
     CUDART_CHECK( ComputeGravitation_GPU_AOS_tiled_const<32>(
         force,
         posMass,
         softeningSquared,
         N ) );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
-    CUDART_CHECK( cudaEventDestroy( evStop ) );
-    CUDART_CHECK( cudaEventDestroy( evStart ) );
+    cuda(EventDestroy( evStop ) );
+    cuda(EventDestroy( evStart ) );
     return ms;
 }

nbody/nbody_GPU_Atomic.cuh (18)

@@ -117,16 +117,16 @@ ComputeGravitation_GPU_Atomic(
     cudaError_t status;
     cudaEvent_t evStart = 0, evStop = 0;
     float ms = 0.0;
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
-    CUDART_CHECK( cudaMemset( force, 0, 3*N*sizeof(float) ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
+    cuda(Memset( force, 0, 3*N*sizeof(float) ) );
     ComputeNBodyGravitation_Atomic<float> <<<300,256>>>( force, posMass, N, softeningSquared );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
-    CUDART_CHECK( cudaEventDestroy( evStop ) );
-    CUDART_CHECK( cudaEventDestroy( evStart ) );
+    cuda(EventDestroy( evStop ) );
+    cuda(EventDestroy( evStart ) );
     return ms;
 }

nbody/nbody_GPU_SOA_tiled.cuh (32)

@@ -165,11 +165,11 @@ ComputeGravitation_GPU_SOA_tiled(
     cudaError_t status;
     dim3 blocks( N/nTile, N/32, 1 );
-    CUDART_CHECK( cudaMemset( forces[0], 0, N*sizeof(float) ) );
-    CUDART_CHECK( cudaMemset( forces[1], 0, N*sizeof(float) ) );
-    CUDART_CHECK( cudaMemset( forces[2], 0, N*sizeof(float) ) );
+    cuda(Memset( forces[0], 0, N*sizeof(float) ) );
+    cuda(Memset( forces[1], 0, N*sizeof(float) ) );
+    cuda(Memset( forces[2], 0, N*sizeof(float) ) );
     ComputeNBodyGravitation_GPU_SOA_tiled<nTile><<<blocks,nTile>>>( forces[0], forces[1], forces[2], posMass, N, softeningSquared );
-    CUDART_CHECK( cudaDeviceSynchronize() );
+    cuda(DeviceSynchronize() );
 Error:
     return status;
 }
@@ -212,31 +212,31 @@ ComputeGravitation_GPU_SOA_tiled(
     float ms = 0.0;
     float *forces[3] = {0};
-    CUDART_CHECK( cudaMalloc( &forces[0], N*sizeof(float) ) );
-    CUDART_CHECK( cudaMalloc( &forces[1], N*sizeof(float) ) );
-    CUDART_CHECK( cudaMalloc( &forces[2], N*sizeof(float) ) );
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
+    cuda(Malloc( &forces[0], N*sizeof(float) ) );
+    cuda(Malloc( &forces[1], N*sizeof(float) ) );
+    cuda(Malloc( &forces[2], N*sizeof(float) ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
     AOStoSOA_GPU_3<<<300,256>>>( forces[0], forces[1], forces[2], force, N );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(EventRecord( evStart, NULL ) );
     CUDART_CHECK( ComputeGravitation_GPU_SOA_tiled<128>(
         forces,
         posMass,
         softeningSquared,
         N ) );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
     SOAtoAOS_GPU_3<<<300,256>>>( force, forces[0], forces[1], forces[2], N );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
-    CUDART_CHECK( cudaEventDestroy( evStop ) );
-    CUDART_CHECK( cudaEventDestroy( evStart ) );
+    cuda(EventDestroy( evStop ) );
+    cuda(EventDestroy( evStart ) );
     return ms;
 }

nbody/nbody_GPU_Shuffle.cuh (16)

@@ -117,15 +117,15 @@ ComputeGravitation_GPU_Shuffle( float *force, float *posMass, float softeningSqu
     cudaError_t status;
     cudaEvent_t evStart = 0, evStop = 0;
     float ms = 0.0f;
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
     ComputeNBodyGravitation_Shuffle <<<300,256>>>( force, posMass, softeningSquared, N );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
-    CUDART_CHECK( cudaEventDestroy( evStop ) );
-    CUDART_CHECK( cudaEventDestroy( evStart ) );
+    cuda(EventDestroy( evStop ) );
+    cuda(EventDestroy( evStart ) );
     return ms;
 }

nbody/nbody_GPU_shared.cu (16)

@@ -89,19 +89,19 @@ ComputeGravitation_GPU_Shared(
     cudaError_t status;
     cudaEvent_t evStart = 0, evStop = 0;
     float ms = 0.0;
-    CUDART_CHECK( cudaEventCreate( &evStart ) );
-    CUDART_CHECK( cudaEventCreate( &evStop ) );
-    CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
+    cuda(EventCreate( &evStart ) );
+    cuda(EventCreate( &evStop ) );
+    cuda(EventRecord( evStart, NULL ) );
     ComputeNBodyGravitation_Shared<<<300,256, 256*sizeof(float4)>>>(
         force,
         posMass,
         softeningSquared,
         N );
-    CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
-    CUDART_CHECK( cudaDeviceSynchronize() );
-    CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
+    cuda(EventRecord( evStop, NULL ) );
+    cuda(DeviceSynchronize() );
+    cuda(EventElapsedTime( &ms, evStart, evStop ) );
 Error:
-    CUDART_CHECK( cudaEventDestroy( evStop ) );
-    CUDART_CHECK( cudaEventDestroy( evStart ) );
+    cuda(EventDestroy( evStop ) );
+    cuda(EventDestroy( evStart ) );
     return ms;
 }

nbody/nbody_multiGPU.cu (22)

@@ -89,25 +89,25 @@ ComputeGravitation_multiGPU_singlethread(
     if ( (0 != N % g_numGPUs) || (g_numGPUs > g_maxGPUs) ) {
         return 0.0f;
     }
-    CUDART_CHECK( cudaGetDevice( &oldDevice ) );
+    cuda(GetDevice( &oldDevice ) );
     // kick off the asynchronous memcpy's - overlap GPUs pulling
     // host memory with the CPU time needed to do the memory
     // allocations.
     for ( int i = 0; i < g_numGPUs; i++ ) {
-        CUDART_CHECK( cudaSetDevice( i ) );
-        CUDART_CHECK( cudaMalloc( &dptrPosMass[i], 4*N*sizeof(float) ) );
+        cuda(SetDevice( i ) );
+        cuda(Malloc( &dptrPosMass[i], 4*N*sizeof(float) ) );
         // we only need 3*N floatsw for the cross-check. otherwise we
         // would need 3*bodiesPerGPU
-        CUDART_CHECK( cudaMalloc( &dptrForce[i], 3*N*sizeof(float) ) );
-        CUDART_CHECK( cudaMemcpyAsync(
+        cuda(Malloc( &dptrForce[i], 3*N*sizeof(float) ) );
+        cuda(MemcpyAsync(
            dptrPosMass[i],
            g_hostAOS_PosMass,
            4*N*sizeof(float),
            cudaMemcpyHostToDevice ) );
     }
     for ( int i = 0; i < g_numGPUs; i++ ) {
-        CUDART_CHECK( cudaSetDevice( i ) );
+        cuda(SetDevice( i ) );
         if ( g_bGPUCrossCheck ) {
             ComputeNBodyGravitation_multiGPU_onethread<<<300,256,256*sizeof(float4)>>>(
                 dptrForce[i],
@@ -116,12 +116,12 @@ ComputeGravitation_multiGPU_singlethread(
                 0,
                 N,
                 N );
-            CUDART_CHECK( cudaMemcpyAsync(
+            cuda(MemcpyAsync(
                 g_hostAOS_gpuCrossCheckForce[i],
                 dptrForce[i],
                 3*N*sizeof(float),
                 cudaMemcpyDeviceToHost ) );
-            CUDART_CHECK( cudaMemcpyAsync(
+            cuda(MemcpyAsync(
                 g_hostAOS_Force+3*bodiesPerGPU*i,
                 dptrForce[i]+3*bodiesPerGPU*i,
                 3*bodiesPerGPU*sizeof(float),
@@ -135,7 +135,7 @@ ComputeGravitation_multiGPU_singlethread(
                 i*bodiesPerGPU,
                 bodiesPerGPU,
                 N );
-            CUDART_CHECK( cudaMemcpyAsync(
+            cuda(MemcpyAsync(
                 g_hostAOS_Force+3*bodiesPerGPU*i,
                 dptrForce[i],
                 3*bodiesPerGPU*sizeof(float),
@@ -144,8 +144,8 @@ ComputeGravitation_multiGPU_singlethread(
     }
     // Synchronize with each GPU in turn.
     for ( int i = 0; i < g_numGPUs; i++ ) {
-        CUDART_CHECK( cudaSetDevice( i ) );
-        CUDART_CHECK( cudaDeviceSynchronize() );
+        cuda(SetDevice( i ) );
+        cuda(DeviceSynchronize() );
     }
     chTimerGetTime( &end );
     ret = chTimerElapsedTime( &start, &end ) * 1000.0f;

nbody/nbody_multiGPU_threaded.cu (8)

@@ -86,9 +86,9 @@ gpuWorkerThread( void *_p )
     //
     // Each GPU has its own device pointer to the host pointer.
     //
-    CUDART_CHECK( cudaMalloc( &dptrPosMass, 4*p->N*sizeof(float) ) );
-    CUDART_CHECK( cudaMalloc( &dptrForce, 3*p->n*sizeof(float) ) );
-    CUDART_CHECK( cudaMemcpyAsync(
+    cuda(Malloc( &dptrPosMass, 4*p->N*sizeof(float) ) );
+    cuda(Malloc( &dptrForce, 3*p->n*sizeof(float) ) );
+    cuda(MemcpyAsync(
         dptrPosMass,
         p->hostPosMass,
         4*p->N*sizeof(float),
@@ -102,7 +102,7 @@ gpuWorkerThread( void *_p )
         p->N );
     // NOTE: synchronous memcpy, so no need for further
    // synchronization with device
-    CUDART_CHECK( cudaMemcpy(
+    cuda(Memcpy(
         p->hostForce+3*p->i,
         dptrForce,
         3*p->n*sizeof(float),
