Browse Source

Get multiple GPUs working

nbody2
Nicholas Wilt 3 years ago
parent
commit
a0148b0f79
  1. 3
      chLib/chError_hip.h
  2. 66
      nbody2/nbody.cu
  3. 30
      nbody2/nbody.h

3
chLib/chError_hip.h

@ -43,6 +43,8 @@
#define __CHERROR_HIP_H__ #define __CHERROR_HIP_H__
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#ifndef DEBUG
#define cuda( fn ) do { \ #define cuda( fn ) do { \
status = (hip##fn); \ status = (hip##fn); \
if ( hipSuccess != (status) ) { \ if ( hipSuccess != (status) ) { \
@ -55,6 +57,7 @@
goto Error; \ goto Error; \
} \ } \
} while (0); } while (0);
#endif
typedef hipEvent_t cudaEvent_t; typedef hipEvent_t cudaEvent_t;
typedef hipError_t cudaError_t; typedef hipError_t cudaError_t;

66
nbody2/nbody.cu

@ -246,13 +246,12 @@ NBodyAlgorithm_GPU<T>::Initialize( size_t N, int seed, T softening )
evStop_ = std::vector<cudaEvent_t>( g_numGPUs ); evStop_ = std::vector<cudaEvent_t>( g_numGPUs );
gpuForce_ = std::vector< thrust::device_vector<Force3D<float>>>( g_numGPUs ); gpuForce_ = std::vector< thrust::device_vector<Force3D<float>>>( g_numGPUs );
gpuPosMass_ = std::vector< thrust::device_vector<PosMass<float>>>( g_numGPUs ); gpuPosMass_ = std::vector< thrust::device_vector<PosMass<float>>>( g_numGPUs );
gpuVelInvMass_ = std::vector< thrust::device_vector<VelInvMass<float>>>( g_numGPUs );
for ( size_t i = 0; i < g_numGPUs; i++ ) { for ( size_t i = 0; i < g_numGPUs; i++ ) {
cuda(SetDevice(i));
cuda(EventCreate( &evStart_[i] ) ); cuda(EventCreate( &evStart_[i] ) );
cuda(EventCreate( &evStop_[i] ) ); cuda(EventCreate( &evStop_[i] ) );
gpuForce_[i] = thrust::device_vector<Force3D<float>>( N ); gpuForce_[i] = thrust::device_vector<Force3D<float>>( N );
gpuPosMass_[i] = thrust::device_vector<PosMass<float>>( N ); gpuPosMass_[i] = thrust::device_vector<PosMass<float>>( N );
gpuVelInvMass_[i] = thrust::device_vector<VelInvMass<float>>( N );
} }
return true; return true;
Error: Error:
@ -543,16 +542,6 @@ NBodyAlgorithm_AVX<T>::computeTimeStep( )
ddy[i] = horizontal_sum_ps( ay ); ddy[i] = horizontal_sum_ps( ay );
ddz[i] = horizontal_sum_ps( az ); ddz[i] = horizontal_sum_ps( az );
#if 0
// Accumulate sum of four floats in the SSE register
ax = horizontal_sum_ps( ax );
ay = horizontal_sum_ps( ay );
az = horizontal_sum_ps( az );
_mm_store_ss( &pddx[i], _mm256_castps256_ps128( ax ) );
_mm_store_ss( &pddy[i], _mm256_castps256_ps128( ay ) );
_mm_store_ss( &pddz[i], _mm256_castps256_ps128( az ) );
#endif
} }
for ( size_t i = 0; i < N; i++ ) { for ( size_t i = 0; i < N; i++ ) {
force[i].ddx_ = ddx[i]; force[i].ddx_ = ddx[i];
@ -661,18 +650,20 @@ NBodyAlgorithm_FMA<T>::computeTimeStep( )
template<typename T> template<typename T>
__global__ void __global__ void
ComputeNBodyGravitation_GPU_AOS( ComputeNBodyGravitation_MultiGPU_AOS(
Force3D<T> *force, Force3D<T> *force,
PosMass<T> *posMass, PosMass<T> *posMass,
T softeningSquared, T softeningSquared,
size_t base_i,
size_t n,
size_t N ) size_t N )
{ {
for ( int i = blockIdx.x*blockDim.x + threadIdx.x; for ( int i = blockIdx.x*blockDim.x + threadIdx.x;
i < N; i < n;
i += blockDim.x*gridDim.x ) i += blockDim.x*gridDim.x )
{ {
T acc[3] = {0}; T acc[3] = {0};
float4 me = ((float4 *) posMass)[i]; float4 me = ((float4 *) posMass)[base_i+i];
T myX = me.x; T myX = me.x;
T myY = me.y; T myY = me.y;
T myZ = me.z; T myZ = me.z;
@ -688,9 +679,9 @@ ComputeNBodyGravitation_GPU_AOS(
acc[1] += fy; acc[1] += fy;
acc[2] += fz; acc[2] += fz;
} }
force[i].ddx_ = acc[0]; force[base_i+i].ddx_ = acc[0];
force[i].ddy_ = acc[1]; force[base_i+i].ddy_ = acc[1];
force[i].ddz_ = acc[2]; force[base_i+i].ddz_ = acc[2];
} }
} }
@ -701,30 +692,46 @@ NBodyAlgorithm_GPU<T>::computeTimeStep( )
cudaError_t status; cudaError_t status;
float ms = 0.0f; float ms = 0.0f;
float softeningSquared = NBodyAlgorithm<T>::softening()*NBodyAlgorithm<T>::softening(); float softeningSquared = NBodyAlgorithm<T>::softening()*NBodyAlgorithm<T>::softening();
size_t N = NBodyAlgorithm<T>::N();
size_t nGPUs = gpuPosMass_.size();
std::vector<size_t> startIndices(nGPUs);
size_t NperGPU = NBodyAlgorithm<T>::N() / nGPUs;
auto N_i = [N,NperGPU] ( size_t i ) -> size_t {
size_t i_gpu = i*NperGPU;
size_t ret = NperGPU;
if ( i_gpu+NperGPU > N )
ret = N-i_gpu;
return ret;
};
for ( size_t i = 0; i < gpuPosMass_.size(); i++ ) { // every GPU gets a copy of the bodies' positions
for ( size_t i = 0; i < nGPUs; i++ ) {
cuda(SetDevice(i)); cuda(SetDevice(i));
cuda(Memcpy( thrust::raw_pointer_cast(gpuPosMass_[i].data()), NBodyAlgorithm<T>::posMass().data(), NBodyAlgorithm<T>::N()*sizeof(PosMass<float>), cudaMemcpyHostToDevice ) ); cuda(Memcpy( thrust::raw_pointer_cast(gpuPosMass_[i].data()), NBodyAlgorithm<T>::posMass().data(), N*sizeof(PosMass<float>), cudaMemcpyHostToDevice ) );
cuda(EventRecord( evStart_[i], NULL ) ); cuda(EventRecord( evStart_[i], NULL ) );
} }
for ( size_t i = 0; i < gpuPosMass_.size(); i++ ) { for ( size_t i = 0; i < nGPUs; i++ ) {
cuda(SetDevice(i)); cuda(SetDevice(i));
ComputeNBodyGravitation_GPU_AOS<<<1024,256>>>( ComputeNBodyGravitation_MultiGPU_AOS<<<1024,256>>>(
thrust::raw_pointer_cast(gpuForce_[0].data()), thrust::raw_pointer_cast(gpuForce_[i].data()),
thrust::raw_pointer_cast(gpuPosMass_[0].data()), thrust::raw_pointer_cast(gpuPosMass_[i].data()),
softeningSquared, softeningSquared,
NBodyAlgorithm<T>::N() ); i*NperGPU,
N_i(i),
N );
cuda(EventRecord( evStop_[i], NULL ) ); cuda(EventRecord( evStop_[i], NULL ) );
} }
for ( size_t i = 0; i < gpuPosMass_.size(); i++ ) { for ( size_t i = 0; i < nGPUs; i++ ) {
cuda(SetDevice(i)); cuda(SetDevice(i));
cuda(DeviceSynchronize() ); cuda(DeviceSynchronize() );
} }
// report max time // report max time
for ( size_t i = 0; i < gpuForce_.size(); i++ ) { for ( size_t i = 0; i < nGPUs; i++ ) {
float et; float et;
cuda(SetDevice(i)); cuda(SetDevice(i));
cuda(Memcpy( NBodyAlgorithm<T>::force().data(), thrust::raw_pointer_cast(gpuForce_[0].data()), NBodyAlgorithm<T>::N()*sizeof(Force3D<float>), cudaMemcpyDeviceToHost ) ); cuda(Memcpy( NBodyAlgorithm<T>::force().data()+i*NperGPU, thrust::raw_pointer_cast(gpuForce_[i].data())+i*NperGPU, N_i(i)*sizeof(Force3D<float>), cudaMemcpyDeviceToHost ) );
cuda(EventElapsedTime( &et, evStart_[i], evStop_[i] ) ); cuda(EventElapsedTime( &et, evStart_[i], evStop_[i] ) );
if ( et > ms ) ms = et; if ( et > ms ) ms = et;
} }
@ -1081,7 +1088,8 @@ main( int argc, char *argv[] )
#endif #endif
status = cudaSuccess; status = cudaSuccess;
g_numGPUs = 1; //cuda(GetDeviceCount( &g_numGPUs ) ); cuda(GetDeviceCount( &g_numGPUs ) );
printf( "%d GPUs detected\n", g_numGPUs );
g_bCUDAPresent = (cudaSuccess == status) && (g_numGPUs > 0); g_bCUDAPresent = (cudaSuccess == status) && (g_numGPUs > 0);
if ( g_bCUDAPresent ) { if ( g_bCUDAPresent ) {
cudaDeviceProp prop; cudaDeviceProp prop;

30
nbody2/nbody.h

@ -137,40 +137,24 @@ public:
virtual bool Initialize( size_t N, int seed, T softening ); virtual bool Initialize( size_t N, int seed, T softening );
virtual float computeTimeStep( ); virtual float computeTimeStep( );
// Processes i'th subarray for the timestep. std::vector<thrust::device_vector<Force3D<float>>>& gpuForce() { return gpuForce_; }
// This virtual function is used to explore different GPU std::vector<thrust::device_vector<PosMass<float>>>& gpuPosMass() { return gpuPosMass_; }
// implementations without duplicating the multi-GPU code.
//virtual float gpuComputeTimeSubstep( size_t i ); const std::vector<thrust::device_vector<Force3D<float>>>& gpuForce() const { return gpuForce_; }
const std::vector<thrust::device_vector<PosMass<float>>>& gpuPosMass() const { return gpuPosMass_; }
private: private:
std::vector<cudaEvent_t> evStart_, evStop_; std::vector<cudaEvent_t> evStart_, evStop_;
std::vector<thrust::device_vector<Force3D<float>>> gpuForce_; std::vector<thrust::device_vector<Force3D<float>>> gpuForce_;
std::vector<thrust::device_vector<PosMass<float>>> gpuPosMass_; std::vector<thrust::device_vector<PosMass<float>>> gpuPosMass_;
std::vector<thrust::device_vector<VelInvMass<float>>> gpuVelInvMass_;
}; };
template<typename T> template<typename T>
class NBodyAlgorithm_MultiGPU : public NBodyAlgorithm_GPU<T> { class NBodyAlgorithm_MultiGPU : public NBodyAlgorithm_GPU<T> {
public: public:
inline NBodyAlgorithm_MultiGPU<T>() { evStart_ = evStop_ = nullptr; } virtual const char *getAlgoName() const { return "multiGPU AOS"; }
virtual ~NBodyAlgorithm_MultiGPU<T>() {
cudaEventDestroy( evStart_ );
cudaEventDestroy( evStop_ );
}
virtual const char *getAlgoName() const { return "GPU AOS"; }
virtual bool Initialize( size_t N, int seed, T softening );
virtual float computeTimeStep( ); virtual float computeTimeStep( );
// Processes i'th subarray for the timestep.
// This virtual function is used to explore different GPU
// implementations without duplicating the multi-GPU code.
virtual float gpuComputeTimeSubstep( size_t i );
private:
cudaEvent_t evStart_, evStop_;
std::vector<thrust::device_vector<Force3D<float>>> gpuForce_;
std::vector<thrust::device_vector<PosMass<float>>> gpuPosMass_;
std::vector<thrust::device_vector<VelInvMass<float>>> gpuVelInvMass_;
}; };

Loading…
Cancel
Save