Browse Source

Ported nbody to new error handling

divergence
Nicholas Wilt 9 years ago
parent
commit
fd03ad3694
  1. 58
      nbody/nbody.cu
  2. 12
      nbody/nbody_GPU_AOS.cuh
  3. 16
      nbody/nbody_GPU_AOS_const.cuh
  4. 20
      nbody/nbody_GPU_AOS_tiled.cuh
  5. 22
      nbody/nbody_GPU_AOS_tiled_const.cuh
  6. 18
      nbody/nbody_GPU_Atomic.cuh
  7. 32
      nbody/nbody_GPU_SOA_tiled.cuh
  8. 16
      nbody/nbody_GPU_Shuffle.cuh
  9. 16
      nbody/nbody_GPU_shared.cu
  10. 22
      nbody/nbody_multiGPU.cu
  11. 8
      nbody/nbody_multiGPU_threaded.cu

58
nbody/nbody.cu

@@ -267,7 +267,7 @@ ComputeGravitation(
// CPU->GPU copies in case we are measuring GPU performance
if ( g_bCUDAPresent ) {
CUDART_CHECK( cudaMemcpyAsync(
cuda(MemcpyAsync(
g_dptrAOS_PosMass,
g_hostAOS_PosMass,
4*g_N*sizeof(float),
@@ -338,7 +338,7 @@ ComputeGravitation(
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
case GPU_AOS_tiled:
*ms = ComputeGravitation_GPU_AOS_tiled(
@@ -346,7 +346,7 @@ ComputeGravitation(
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
case GPU_AOS_tiled_const:
*ms = ComputeGravitation_GPU_AOS_tiled_const(
@@ -354,46 +354,46 @@ ComputeGravitation(
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
#if 0
// commented out - too slow even on SM 3.0
case GPU_Atomic:
CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*sizeof(float) ) );
cuda(Memset( g_dptrAOS_Force, 0, 3*sizeof(float) ) );
*ms = ComputeGravitation_GPU_Atomic(
g_dptrAOS_Force,
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
#endif
case GPU_Shared:
CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
cuda(Memset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
*ms = ComputeGravitation_GPU_Shared(
g_dptrAOS_Force,
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
case GPU_Const:
CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
cuda(Memset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
*ms = ComputeNBodyGravitation_GPU_AOS_const(
g_dptrAOS_Force,
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
case GPU_Shuffle:
CUDART_CHECK( cudaMemset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
cuda(Memset( g_dptrAOS_Force, 0, 3*g_N*sizeof(float) ) );
*ms = ComputeGravitation_GPU_Shuffle(
g_dptrAOS_Force,
g_dptrAOS_PosMass,
g_softening*g_softening,
g_N );
CUDART_CHECK( cudaMemcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
cuda(Memcpy( g_hostAOS_Force, g_dptrAOS_Force, 3*g_N*sizeof(float), cudaMemcpyDeviceToHost ) );
break;
case multiGPU_SingleCPUThread:
memset( g_hostAOS_Force, 0, 3*g_N*sizeof(float) );
@@ -516,9 +516,9 @@ initializeGPU( void *_p )
cudaError_t status;
gpuInit_struct *p = (gpuInit_struct *) _p;
CUDART_CHECK( cudaSetDevice( p->iGPU ) );
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
CUDART_CHECK( cudaFree(0) );
cuda(SetDevice( p->iGPU ) );
cuda(SetDeviceFlags( cudaDeviceMapHost ) );
cuda(Free(0) );
Error:
p->status = status;
}
@@ -560,7 +560,7 @@ main( int argc, char *argv[] )
g_bCUDAPresent = (cudaSuccess == status) && (g_numGPUs > 0);
if ( g_bCUDAPresent ) {
cudaDeviceProp prop;
CUDART_CHECK( cudaGetDeviceProperties( &prop, 0 ) );
cuda(GetDeviceProperties( &prop, 0 ) );
g_bSM30Present = prop.major >= 3;
}
g_bNoCPU = chCommandLineGetBool( "nocpu", argc, argv );
@@ -711,27 +711,27 @@ main( int argc, char *argv[] )
if ( g_bCUDAPresent ) {
cudaDeviceProp propForVersion;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
CUDART_CHECK( cudaGetDeviceProperties( &propForVersion, 0 ) );
cuda(SetDeviceFlags( cudaDeviceMapHost ) );
cuda(GetDeviceProperties( &propForVersion, 0 ) );
if ( propForVersion.major < 3 ) {
// Only SM 3.x supports shuffle and fast atomics, so we cannot run
// some algorithms on this board.
g_maxAlgorithm = multiGPU_MultiCPUThread;
}
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_PosMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostAOS_PosMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
for ( int i = 0; i < 3; i++ ) {
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_Pos[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_Force[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostSOA_Pos[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostSOA_Force[i], g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
}
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_Force, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_Force_Golden, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostAOS_VelInvMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_Mass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostAlloc( (void **) &g_hostSOA_InvMass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostAOS_Force, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostAOS_Force_Golden, 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostAOS_VelInvMass, 4*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostSOA_Mass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) &g_hostSOA_InvMass, g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
CUDART_CHECK( cudaMalloc( &g_dptrAOS_PosMass, 4*g_N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( (void **) &g_dptrAOS_Force, 3*g_N*sizeof(float) ) );
cuda(Malloc( &g_dptrAOS_PosMass, 4*g_N*sizeof(float) ) );
cuda(Malloc( (void **) &g_dptrAOS_Force, 3*g_N*sizeof(float) ) );
if ( g_bGPUCrossCheck ) {
printf( "GPU cross check enabled (%d GPUs), disabling CPU\n", g_numGPUs );
@@ -742,7 +742,7 @@ main( int argc, char *argv[] )
goto Error;
}
for ( int i = 0; i < g_numGPUs; i++ ) {
CUDART_CHECK( cudaHostAlloc( (void **) (&g_hostAOS_gpuCrossCheckForce[i]), 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
cuda(HostAlloc( (void **) (&g_hostAOS_gpuCrossCheckForce[i]), 3*g_N*sizeof(float), cudaHostAllocPortable|cudaHostAllocMapped ) );
}
}
}

12
nbody/nbody_GPU_AOS.cuh

@@ -79,14 +79,14 @@ ComputeGravitation_GPU_AOS(
cudaError_t status;
cudaEvent_t evStart = 0, evStop = 0;
float ms = 0.0;
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
ComputeNBodyGravitation_GPU_AOS<float> <<<300,256>>>(
force, posMass, N, softeningSquared );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
cudaEventDestroy( evStop );
cudaEventDestroy( evStart );

16
nbody/nbody_GPU_AOS_const.cuh

@@ -89,18 +89,18 @@ ComputeNBodyGravitation_GPU_AOS_const(
size_t bodiesLeft = N;
void *p;
CUDART_CHECK( cudaGetSymbolAddress( &p, g_constantBodies ) );
cuda(GetSymbolAddress( &p, g_constantBodies ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
for ( size_t i = 0; i < N; i += g_bodiesPerPass ) {
// bodiesThisPass = max(bodiesLeft, g_bodiesPerPass);
size_t bodiesThisPass = bodiesLeft;
if ( bodiesThisPass > g_bodiesPerPass ) {
bodiesThisPass = g_bodiesPerPass;
}
CUDART_CHECK( cudaMemcpyToSymbolAsync(
cuda(MemcpyToSymbolAsync(
g_constantBodies,
((float4 *) posMass)+i,
bodiesThisPass*sizeof(float4),
@@ -111,9 +111,9 @@ ComputeNBodyGravitation_GPU_AOS_const(
force, posMass, softeningSquared, bodiesThisPass, N );
bodiesLeft -= bodiesThisPass;
}
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
cudaEventDestroy( evStop );
cudaEventDestroy( evStart );

20
nbody/nbody_GPU_AOS_tiled.cuh

@@ -186,9 +186,9 @@ ComputeGravitation_GPU_AOS_tiled(
cudaError_t status;
dim3 blocks( N/nTile, N/32, 1 );
CUDART_CHECK( cudaMemset( force, 0, 3*N*sizeof(float) ) );
cuda(Memset( force, 0, 3*N*sizeof(float) ) );
ComputeNBodyGravitation_GPU_tiled<nTile><<<blocks,nTile>>>( force, posMass, N, softeningSquared );
CUDART_CHECK( cudaDeviceSynchronize() );
cuda(DeviceSynchronize() );
Error:
return status;
}
@@ -204,20 +204,20 @@ ComputeGravitation_GPU_AOS_tiled(
cudaError_t status;
cudaEvent_t evStart = 0, evStop = 0;
float ms = 0.0;
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
CUDART_CHECK( ComputeGravitation_GPU_AOS_tiled<128>(
force,
posMass,
softeningSquared,
N ) );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
CUDART_CHECK( cudaEventDestroy( evStop ) );
CUDART_CHECK( cudaEventDestroy( evStart ) );
cuda(EventDestroy( evStop ) );
cuda(EventDestroy( evStart ) );
return ms;
}
#else

22
nbody/nbody_GPU_AOS_tiled_const.cuh

@@ -303,9 +303,9 @@ ComputeGravitation_GPU_AOS_tiled_const(
cudaError_t status;
dim3 blocks( N/nTile, N/32, 1 );
CUDART_CHECK( cudaMemset( force, 0, 3*N*sizeof(float) ) );
cuda(Memset( force, 0, 3*N*sizeof(float) ) );
ComputeNBodyGravitation_GPU_tiled_const<nTile><<<blocks,nTile>>>( force, posMass, N, softeningSquared );
CUDART_CHECK( cudaDeviceSynchronize() );
cuda(DeviceSynchronize() );
Error:
return status;
}
@@ -321,20 +321,20 @@ ComputeGravitation_GPU_AOS_tiled_const(
cudaError_t status;
cudaEvent_t evStart = 0, evStop = 0;
float ms = 0.0;
CUDART_CHECK( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(DeviceSetCacheConfig( cudaFuncCachePreferShared ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
CUDART_CHECK( ComputeGravitation_GPU_AOS_tiled_const<32>(
force,
posMass,
softeningSquared,
N ) );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
CUDART_CHECK( cudaEventDestroy( evStop ) );
CUDART_CHECK( cudaEventDestroy( evStart ) );
cuda(EventDestroy( evStop ) );
cuda(EventDestroy( evStart ) );
return ms;
}

18
nbody/nbody_GPU_Atomic.cuh

@@ -117,16 +117,16 @@ ComputeGravitation_GPU_Atomic(
cudaError_t status;
cudaEvent_t evStart = 0, evStop = 0;
float ms = 0.0;
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
CUDART_CHECK( cudaMemset( force, 0, 3*N*sizeof(float) ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
cuda(Memset( force, 0, 3*N*sizeof(float) ) );
ComputeNBodyGravitation_Atomic<float> <<<300,256>>>( force, posMass, N, softeningSquared );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
CUDART_CHECK( cudaEventDestroy( evStop ) );
CUDART_CHECK( cudaEventDestroy( evStart ) );
cuda(EventDestroy( evStop ) );
cuda(EventDestroy( evStart ) );
return ms;
}

32
nbody/nbody_GPU_SOA_tiled.cuh

@@ -165,11 +165,11 @@ ComputeGravitation_GPU_SOA_tiled(
cudaError_t status;
dim3 blocks( N/nTile, N/32, 1 );
CUDART_CHECK( cudaMemset( forces[0], 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( forces[1], 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( forces[2], 0, N*sizeof(float) ) );
cuda(Memset( forces[0], 0, N*sizeof(float) ) );
cuda(Memset( forces[1], 0, N*sizeof(float) ) );
cuda(Memset( forces[2], 0, N*sizeof(float) ) );
ComputeNBodyGravitation_GPU_SOA_tiled<nTile><<<blocks,nTile>>>( forces[0], forces[1], forces[2], posMass, N, softeningSquared );
CUDART_CHECK( cudaDeviceSynchronize() );
cuda(DeviceSynchronize() );
Error:
return status;
}
@@ -212,31 +212,31 @@ ComputeGravitation_GPU_SOA_tiled(
float ms = 0.0;
float *forces[3] = {0};
CUDART_CHECK( cudaMalloc( &forces[0], N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &forces[1], N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &forces[2], N*sizeof(float) ) );
cuda(Malloc( &forces[0], N*sizeof(float) ) );
cuda(Malloc( &forces[1], N*sizeof(float) ) );
cuda(Malloc( &forces[2], N*sizeof(float) ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
AOStoSOA_GPU_3<<<300,256>>>( forces[0], forces[1], forces[2], force, N );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(EventRecord( evStart, NULL ) );
CUDART_CHECK( ComputeGravitation_GPU_SOA_tiled<128>(
forces,
posMass,
softeningSquared,
N ) );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
cuda(EventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
cuda(DeviceSynchronize() );
SOAtoAOS_GPU_3<<<300,256>>>( force, forces[0], forces[1], forces[2], N );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
CUDART_CHECK( cudaEventDestroy( evStop ) );
CUDART_CHECK( cudaEventDestroy( evStart ) );
cuda(EventDestroy( evStop ) );
cuda(EventDestroy( evStart ) );
return ms;
}

16
nbody/nbody_GPU_Shuffle.cuh

@@ -117,15 +117,15 @@ ComputeGravitation_GPU_Shuffle( float *force, float *posMass, float softeningSqu
cudaError_t status;
cudaEvent_t evStart = 0, evStop = 0;
float ms = 0.0f;
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
ComputeNBodyGravitation_Shuffle <<<300,256>>>( force, posMass, softeningSquared, N );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
CUDART_CHECK( cudaEventDestroy( evStop ) );
CUDART_CHECK( cudaEventDestroy( evStart ) );
cuda(EventDestroy( evStop ) );
cuda(EventDestroy( evStart ) );
return ms;
}

16
nbody/nbody_GPU_shared.cu

@@ -89,19 +89,19 @@ ComputeGravitation_GPU_Shared(
cudaError_t status;
cudaEvent_t evStart = 0, evStop = 0;
float ms = 0.0;
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
CUDART_CHECK( cudaEventRecord( evStart, NULL ) );
cuda(EventCreate( &evStart ) );
cuda(EventCreate( &evStop ) );
cuda(EventRecord( evStart, NULL ) );
ComputeNBodyGravitation_Shared<<<300,256, 256*sizeof(float4)>>>(
force,
posMass,
softeningSquared,
N );
CUDART_CHECK( cudaEventRecord( evStop, NULL ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventElapsedTime( &ms, evStart, evStop ) );
cuda(EventRecord( evStop, NULL ) );
cuda(DeviceSynchronize() );
cuda(EventElapsedTime( &ms, evStart, evStop ) );
Error:
CUDART_CHECK( cudaEventDestroy( evStop ) );
CUDART_CHECK( cudaEventDestroy( evStart ) );
cuda(EventDestroy( evStop ) );
cuda(EventDestroy( evStart ) );
return ms;
}

22
nbody/nbody_multiGPU.cu

@@ -89,25 +89,25 @@ ComputeGravitation_multiGPU_singlethread(
if ( (0 != N % g_numGPUs) || (g_numGPUs > g_maxGPUs) ) {
return 0.0f;
}
CUDART_CHECK( cudaGetDevice( &oldDevice ) );
cuda(GetDevice( &oldDevice ) );
// kick off the asynchronous memcpy's - overlap GPUs pulling
// host memory with the CPU time needed to do the memory
// allocations.
for ( int i = 0; i < g_numGPUs; i++ ) {
CUDART_CHECK( cudaSetDevice( i ) );
CUDART_CHECK( cudaMalloc( &dptrPosMass[i], 4*N*sizeof(float) ) );
cuda(SetDevice( i ) );
cuda(Malloc( &dptrPosMass[i], 4*N*sizeof(float) ) );
// we only need 3*N floats for the cross-check. otherwise we
// would need 3*bodiesPerGPU
CUDART_CHECK( cudaMalloc( &dptrForce[i], 3*N*sizeof(float) ) );
CUDART_CHECK( cudaMemcpyAsync(
cuda(Malloc( &dptrForce[i], 3*N*sizeof(float) ) );
cuda(MemcpyAsync(
dptrPosMass[i],
g_hostAOS_PosMass,
4*N*sizeof(float),
cudaMemcpyHostToDevice ) );
}
for ( int i = 0; i < g_numGPUs; i++ ) {
CUDART_CHECK( cudaSetDevice( i ) );
cuda(SetDevice( i ) );
if ( g_bGPUCrossCheck ) {
ComputeNBodyGravitation_multiGPU_onethread<<<300,256,256*sizeof(float4)>>>(
dptrForce[i],
@@ -116,12 +116,12 @@ ComputeGravitation_multiGPU_singlethread(
0,
N,
N );
CUDART_CHECK( cudaMemcpyAsync(
cuda(MemcpyAsync(
g_hostAOS_gpuCrossCheckForce[i],
dptrForce[i],
3*N*sizeof(float),
cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaMemcpyAsync(
cuda(MemcpyAsync(
g_hostAOS_Force+3*bodiesPerGPU*i,
dptrForce[i]+3*bodiesPerGPU*i,
3*bodiesPerGPU*sizeof(float),
@@ -135,7 +135,7 @@ ComputeGravitation_multiGPU_singlethread(
i*bodiesPerGPU,
bodiesPerGPU,
N );
CUDART_CHECK( cudaMemcpyAsync(
cuda(MemcpyAsync(
g_hostAOS_Force+3*bodiesPerGPU*i,
dptrForce[i],
3*bodiesPerGPU*sizeof(float),
@@ -144,8 +144,8 @@ ComputeGravitation_multiGPU_singlethread(
}
// Synchronize with each GPU in turn.
for ( int i = 0; i < g_numGPUs; i++ ) {
CUDART_CHECK( cudaSetDevice( i ) );
CUDART_CHECK( cudaDeviceSynchronize() );
cuda(SetDevice( i ) );
cuda(DeviceSynchronize() );
}
chTimerGetTime( &end );
ret = chTimerElapsedTime( &start, &end ) * 1000.0f;

8
nbody/nbody_multiGPU_threaded.cu

@@ -86,9 +86,9 @@ gpuWorkerThread( void *_p )
//
// Each GPU has its own device pointer to the host pointer.
//
CUDART_CHECK( cudaMalloc( &dptrPosMass, 4*p->N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrForce, 3*p->n*sizeof(float) ) );
CUDART_CHECK( cudaMemcpyAsync(
cuda(Malloc( &dptrPosMass, 4*p->N*sizeof(float) ) );
cuda(Malloc( &dptrForce, 3*p->n*sizeof(float) ) );
cuda(MemcpyAsync(
dptrPosMass,
p->hostPosMass,
4*p->N*sizeof(float),
@@ -102,7 +102,7 @@ gpuWorkerThread( void *_p )
p->N );
// NOTE: synchronous memcpy, so no need for further
// synchronization with device
CUDART_CHECK( cudaMemcpy(
cuda(Memcpy(
p->hostForce+3*p->i,
dptrForce,
3*p->n*sizeof(float),

Loading…
Cancel
Save