Browse Source

Use absolute error instead of relative error

hipcc
Nicholas Wilt 3 years ago
parent
commit
871c6a954d
  1. 154
      nbody2/nbody.cu
  2. 3
      nbody2/nbody.h

154
nbody2/nbody.cu

@ -92,14 +92,6 @@ randomUnitBodies( int seed, std::vector<PosMass<float>>& pos, std::vector<VelInv @@ -92,14 +92,6 @@ randomUnitBodies( int seed, std::vector<PosMass<float>>& pos, std::vector<VelInv
}
}
template<typename T>
static float
relError( float a, float b )
{
if ( a == b ) return 0.0f;
return fabsf(a-b)/b;
}
bool g_bCUDAPresent;
bool g_bSM30Present;
@ -145,6 +137,15 @@ relError( T a, T b ) @@ -145,6 +137,15 @@ relError( T a, T b )
return (relErr<0.0f) ? -relErr : relErr;
}
template<typename T>
static T
absError( T a, T b )
{
if ( a == b ) return 0.0f;
return (T) fabs(a-b);
}
#include "nbody_CPU_AOS.h"
//#include "nbody_CPU_AOS_tiled.h"
@ -161,14 +162,15 @@ relError( T a, T b ) @@ -161,14 +162,15 @@ relError( T a, T b )
//#include "nbody_GPU_Atomic.cuh"
#endif
template<typename T>
void
integrateGravitation_AOS( std::vector<PosMass<float>>& posMass, std::vector<VelInvMass<float>>& pvel, const std::vector<Force3D<float>>& pforce, float dt, float damping, size_t N )
NBodyAlgorithm<T>::integrateGravitation( T dt, T damping )
{
for ( size_t i = 0; i < N; i++ ) {
float pos[3] = { posMass[i].x_, posMass[i].y_, posMass[i].z_ };
float vel[3] = { pvel[i].dx_, pvel[i].dy_, pvel[i].dz_ };
float invMass = pvel[i].invMass_;
float force[3] = { pforce[i].ddx_, pforce[i].ddy_, pforce[i].ddz_ };
for ( size_t i = 0; i < N_; i++ ) {
float pos[3] = { posMass_[i].x_, posMass_[i].y_, posMass_[i].z_ };
float vel[3] = { velInvMass_[i].dx_, velInvMass_[i].dy_, velInvMass_[i].dz_ };
float invMass = velInvMass_[i].invMass_;
float force[3] = { force_[i].ddx_, force_[i].ddy_, force_[i].ddz_ };
// acceleration = force / mass;
// new velocity = old velocity + acceleration * deltaTime
@ -185,13 +187,13 @@ integrateGravitation_AOS( std::vector<PosMass<float>>& posMass, std::vector<VelI @@ -185,13 +187,13 @@ integrateGravitation_AOS( std::vector<PosMass<float>>& posMass, std::vector<VelI
pos[1] += vel[1] * dt;
pos[2] += vel[2] * dt;
posMass[i].x_ = pos[0];
posMass[i].y_ = pos[1];
posMass[i].z_ = pos[2];
posMass_[i].x_ = pos[0];
posMass_[i].y_ = pos[1];
posMass_[i].z_ = pos[2];
pvel[i].dx_ = vel[0];
pvel[i].dy_ = vel[1];
pvel[i].dz_ = vel[2];
velInvMass_[i].dx_ = vel[0];
velInvMass_[i].dy_ = vel[1];
velInvMass_[i].dz_ = vel[2];
}
}
@ -289,40 +291,32 @@ NBodyAlgorithm<T>::computeTimeStep( ) @@ -289,40 +291,32 @@ NBodyAlgorithm<T>::computeTimeStep( )
template<typename T>
__global__ void
ComputeNBodyGravitation_Shared(
ComputeNBodyGravitation_GPU_AOS(
Force3D<T> *force,
PosMass<T> *posMass,
T softeningSquared,
size_t N )
{
extern __shared__ float4 shPosMass[];
for ( int i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x )
{
float acc[3] = {0};
float4 myPosMass = ((float4 *) posMass)[i];
#pragma unroll 32
for ( int j = 0; j < N; j += blockDim.x ) {
shPosMass[threadIdx.x] = ((float4 *) posMass)[j+threadIdx.x];
__syncthreads();
for ( size_t k = 0; k < blockDim.x; k++ ) {
float fx, fy, fz;
float4 bodyPosMass = shPosMass[k];
bodyBodyInteraction(
&fx, &fy, &fz,
myPosMass.x, myPosMass.y, myPosMass.z,
bodyPosMass.x,
bodyPosMass.y,
bodyPosMass.z,
bodyPosMass.w,
softeningSquared );
acc[0] += fx;
acc[1] += fy;
acc[2] += fz;
}
__syncthreads();
T acc[3] = {0};
float4 me = ((float4 *) posMass)[i];
T myX = me.x;
T myY = me.y;
T myZ = me.z;
for ( int j = 0; j < N; j++ ) {
float4 body = ((float4 *) posMass)[j];
float fx, fy, fz;
bodyBodyInteraction(
&fx, &fy, &fz,
myX, myY, myZ,
body.x, body.y, body.z, body.w,
softeningSquared);
acc[0] += fx;
acc[1] += fy;
acc[2] += fz;
}
force[i].ddx_ = acc[0];
force[i].ddy_ = acc[1];
@ -340,7 +334,7 @@ NBodyAlgorithm_GPU<T>::computeTimeStep( ) @@ -340,7 +334,7 @@ NBodyAlgorithm_GPU<T>::computeTimeStep( )
cuda(Memcpy( thrust::raw_pointer_cast(gpuPosMass_.data()), NBodyAlgorithm<T>::posMass().data(), NBodyAlgorithm<T>::N()*sizeof(PosMass<float>), cudaMemcpyHostToDevice ) );
cuda(EventRecord( evStart_, NULL ) );
ComputeNBodyGravitation_Shared<<<1024,256, 256*sizeof(float4)>>>(
ComputeNBodyGravitation_GPU_AOS<<<1024,256>>>(
thrust::raw_pointer_cast(gpuForce_.data()),
thrust::raw_pointer_cast(gpuPosMass_.data()),
softeningSquared,
@ -413,7 +407,27 @@ ComputeGravitation( @@ -413,7 +407,27 @@ ComputeGravitation(
}
#endif
*maxRelError = 0.0f;
if ( bCrossCheck ) {
auto gpuPosMass = gpuAlgo->posMass();
auto refPosMass = refAlgo->posMass();
float max = 0.0f;
for ( size_t i = 0; i < g_N; i++ ) {
float xerr = absError( gpuPosMass[i].x_, refPosMass[i].x_ );
float yerr = absError( gpuPosMass[i].y_, refPosMass[i].y_ );
float zerr = absError( gpuPosMass[i].z_, refPosMass[i].z_ );
if ( xerr > max ) max = xerr;
if ( yerr > max ) max = yerr;
if ( zerr > max ) max = zerr;
}
*maxRelError = max;
printf( "%s crosscheck against gold %s: maxRelError = gpuForce-> %E\n",
gpuAlgo->getAlgoName(), refAlgo->getAlgoName(), max ); fflush( stdout );
}
*ms = gpuAlgo->computeTimeStep( );
#if 0
switch ( algorithm ) {
case CPU_AOS:
@ -598,25 +612,23 @@ ComputeGravitation( @@ -598,25 +612,23 @@ ComputeGravitation(
auto refForce = refAlgo->force();
float max = 0.0f;
for ( size_t i = 0; i < g_N; i++ ) {
float xerr = relError( gpuForce[i].ddx_, refForce[i].ddx_ );
float yerr = relError( gpuForce[i].ddy_, refForce[i].ddy_ );
float zerr = relError( gpuForce[i].ddz_, refForce[i].ddz_ );
float xerr = absError( gpuForce[i].ddx_, refForce[i].ddx_ );
float yerr = absError( gpuForce[i].ddy_, refForce[i].ddy_ );
float zerr = absError( gpuForce[i].ddz_, refForce[i].ddz_ );
if ( xerr >= 1.0f || yerr >= 1.0f || zerr >= 1.0 ) {
asm("int $3");
}
if ( xerr > max ) max = xerr;
if ( yerr > max ) max = yerr;
if ( zerr > max ) max = zerr;
}
*maxRelError = max;
printf( "%s crosscheck against gold %s: maxRelError = gpuForce-> %E\n",
gpuAlgo->getAlgoName(), refAlgo->getAlgoName(), max ); fflush( stdout );
//printf( "%s crosscheck against gold %s: maxRelError = gpuForce-> %E\n",
// gpuAlgo->getAlgoName(), refAlgo->getAlgoName(), max ); fflush( stdout );
refAlgo->integrateGravitation( g_dt, g_damping );
}
integrateGravitation_AOS(
gpuAlgo->posMass(),
gpuAlgo->velInvMass(),
gpuAlgo->force(),
g_dt,
g_damping,
g_N );
gpuAlgo->integrateGravitation( g_dt, g_damping );
if ( g_bGPUCrossCheck && g_fGPUCrosscheckInput ) {
if ( memcmp( g_hostAOS_Force.data(), g_hostAOS_Force_Golden.data(), 3*g_N*sizeof(float) ) ) {
@ -625,28 +637,6 @@ ComputeGravitation( @@ -625,28 +637,6 @@ ComputeGravitation(
}
}
#if 0
else {
KahanAdder sumX;
KahanAdder sumY;
KahanAdder sumZ;
for ( size_t i = 0; i < g_N; i++ ) {
sumX += g_hostAOS_Force[i*3+0];
sumY += g_hostAOS_Force[i*3+1];
sumZ += g_hostAOS_Force[i*3+2];
}
*maxRelError = std::max( fabs(sumX), std::max(fabs(sumY), fabs(sumZ)) );
if ( g_ZeroThreshold != 0.0 &&
fabs( *maxRelError ) > g_ZeroThreshold ) {
printf( "Maximum sum of forces > threshold (%E > %E)\n",
*maxRelError,
g_ZeroThreshold );
goto Error;
}
}
#endif
return true;
Error:
return false;
@ -1002,14 +992,14 @@ main( int argc, char *argv[] ) @@ -1002,14 +992,14 @@ main( int argc, char *argv[] )
double interactionsPerSecond = (double) g_N*g_N*1000.0f / ms;
if ( interactionsPerSecond > 1e9 ) {
printf ( "\r%s: %8.2f ms = %8.3fx10^9 interactions/s (Rel. error: %E)\n",
rgszAlgorithmNames[g_Algorithm],
gpuAlgo->getAlgoName(),
ms,
interactionsPerSecond/1e9,
err );
}
else {
printf ( "\r%s: %8.2f ms = %8.3fx10^6 interactions/s (Rel. error: %E)\n",
rgszAlgorithmNames[g_Algorithm],
gpuAlgo->getAlgoName(),
ms,
interactionsPerSecond/1e6,
err );

3
nbody2/nbody.h

@ -100,6 +100,7 @@ public: @@ -100,6 +100,7 @@ public:
// return value is elapsed time needed for the time step
virtual float computeTimeStep( );//std::vector<Force3D<T> >& force );
virtual void integrateGravitation( T dt, T damping );
// accessors
void setBody( size_t i, const PosMass<T>& body ) { posMass_[i] = body; }
@ -134,7 +135,7 @@ public: @@ -134,7 +135,7 @@ public:
}
virtual const char *getAlgoName() const { return "GPU AOS"; }
virtual bool Initialize( size_t N, int seed, T softening );
virtual float computeTimeStep( );//std::vector<Force3D<T> >& force );
virtual float computeTimeStep( );
private:
cudaEvent_t evStart_, evStop_;

Loading…
Cancel
Save