5 changed files with 306 additions and 1 deletions
@ -0,0 +1,116 @@ |
|||||||
|
/*
|
||||||
|
* |
||||||
|
* bodybodyInteraction_NEON.h |
||||||
|
* |
||||||
|
* NEON implementation of N-body computation. |
||||||
|
* |
||||||
|
* Copyright (c) 2011-2012, Archaea Software, LLC. |
||||||
|
* All rights reserved. |
||||||
|
* |
||||||
|
* Redistribution and use in source and binary forms, with or without |
||||||
|
* modification, are permitted provided that the following conditions |
||||||
|
* are met: |
||||||
|
* |
||||||
|
* 1. Redistributions of source code must retain the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer. |
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer in |
||||||
|
* the documentation and/or other materials provided with the |
||||||
|
* distribution. |
||||||
|
* |
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||||||
|
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||||
|
* POSSIBILITY OF SUCH DAMAGE. |
||||||
|
* |
||||||
|
*/ |
||||||
|
|
||||||
|
#ifdef __ARM_NEON__ |
||||||
|
|
||||||
|
#include <arm_neon.h> |
||||||
|
|
||||||
|
typedef float vf32x4_t __attribute__ ((vector_size(16),aligned(1))); |
||||||
|
|
||||||
|
static const vf32x4_t vec_zero = {0.0f, 0.0f, 0.0f, 0.0f}; |
||||||
|
|
||||||
|
typedef union { |
||||||
|
float32x4_t v; |
||||||
|
float f[4]; |
||||||
|
vf32x4_t p; |
||||||
|
} v4; |
||||||
|
|
||||||
|
// Broadcasts a scalar into all four lanes.
//   f        value to replicate
//   returns  a vf32x4_t whose four lanes all hold f
static inline vf32x4_t
_vec_set_ps1(float f)
{
    v4 splat;

    // vdupq_n_f32 yields the NEON type; hand it back through the union as
    // the generic vector type.
    splat.v = vdupq_n_f32(f);
    return splat.p;
}
||||||
|
|
||||||
|
static inline float |
||||||
|
_vec_sum(vf32x4_t const &v) |
||||||
|
{ |
||||||
|
float32x2_t r; |
||||||
|
v4 iv; |
||||||
|
iv.p = v; |
||||||
|
r = vadd_f32(vget_high_f32(iv.v), vget_low_f32(iv.v)); |
||||||
|
return vget_lane_f32(vpadd_f32(r, r), 0); |
||||||
|
} |
||||||
|
|
||||||
|
// Lane-wise approximate reciprocal square root, 1/sqrt(_v), computed as the
// hardware estimate refined by one Newton-Raphson iteration.
static inline vf32x4_t
rcp_sqrt_nr_ps(const vf32x4_t& _v)
{
    v4 input;
    input.p = _v;

    // Initial estimate e ~= 1/sqrt(d).
    const float32x4_t estimate = vrsqrteq_f32(input.v);

    // Refinement factor: vrsqrtsq_f32(a, b) evaluates (3 - a*b) / 2, so with
    // a = e*e and b = d the product below is e * (3 - d*e*e) / 2.
    const float32x4_t estimateSq = vmulq_f32(estimate, estimate);
    const float32x4_t correction = vrsqrtsq_f32(estimateSq, input.v);

    v4 refined;
    refined.v = vmulq_f32(correction, estimate);
    return refined.p;
}
||||||
|
|
||||||
|
inline void |
||||||
|
bodyBodyInteraction( |
||||||
|
vf32x4_t& fx, |
||||||
|
vf32x4_t& fy, |
||||||
|
vf32x4_t& fz, |
||||||
|
|
||||||
|
const vf32x4_t& x0, |
||||||
|
const vf32x4_t& y0, |
||||||
|
const vf32x4_t& z0, |
||||||
|
|
||||||
|
const vf32x4_t& x1, |
||||||
|
const vf32x4_t& y1, |
||||||
|
const vf32x4_t& z1, |
||||||
|
const vf32x4_t& mass1, |
||||||
|
|
||||||
|
const vf32x4_t& softeningSquared ) |
||||||
|
{ |
||||||
|
// r_01 [3 FLOPS]
|
||||||
|
vf32x4_t dx = x1 - x0; |
||||||
|
vf32x4_t dy = y1 - y0; |
||||||
|
vf32x4_t dz = z1 - z0; |
||||||
|
|
||||||
|
// d^2 + e^2 [6 FLOPS]
|
||||||
|
vf32x4_t distSq = ( dx * dx ) + ( dy * dy ) + ( dz * dz ); |
||||||
|
distSq = distSq + softeningSquared; |
||||||
|
|
||||||
|
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
|
||||||
|
vf32x4_t invDist = rcp_sqrt_nr_ps ( distSq ); |
||||||
|
vf32x4_t invDistCube = invDist * invDist * invDist; |
||||||
|
|
||||||
|
// s = m_j * invDistCube [1 FLOP]
|
||||||
|
vf32x4_t s = mass1 * invDistCube; |
||||||
|
|
||||||
|
// (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS]
|
||||||
|
fx = fx + (dx * s); |
||||||
|
fy = fy + (dx * s); |
||||||
|
fz = fz + (dz * s); |
||||||
|
} |
||||||
|
|
||||||
|
#endif |
@ -0,0 +1,90 @@ |
|||||||
|
/*
|
||||||
|
* |
||||||
|
* nbody_CPU_NEON.cpp |
||||||
|
* |
||||||
|
* Single-threaded NEON CPU implementation of the O(N^2) N-body calculation. |
||||||
|
* Uses SOA (structure of arrays) representation because it is a much |
||||||
|
* better fit for NEON. |
||||||
|
* |
||||||
|
* Copyright (c) 2011-2012, Archaea Software, LLC. |
||||||
|
* All rights reserved. |
||||||
|
* |
||||||
|
* Redistribution and use in source and binary forms, with or without |
||||||
|
* modification, are permitted provided that the following conditions |
||||||
|
* are met: |
||||||
|
* |
||||||
|
* 1. Redistributions of source code must retain the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer. |
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer in |
||||||
|
* the documentation and/or other materials provided with the |
||||||
|
* distribution. |
||||||
|
* |
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||||||
|
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||||
|
* POSSIBILITY OF SUCH DAMAGE. |
||||||
|
* |
||||||
|
*/ |
||||||
|
|
||||||
|
#ifdef __ARM_NEON__ |
||||||
|
#include <chTimer.h> |
||||||
|
|
||||||
|
#include "nbody.h" |
||||||
|
#include "bodybodyInteraction_NEON.h" |
||||||
|
#include "nbody_CPU_SIMD.h" |
||||||
|
|
||||||
|
float |
||||||
|
ComputeGravitation_SIMD( |
||||||
|
float *force[3], |
||||||
|
float *pos[4], |
||||||
|
float *mass, |
||||||
|
float softeningSquared, |
||||||
|
size_t N |
||||||
|
) |
||||||
|
{ |
||||||
|
chTimerTimestamp start, end; |
||||||
|
chTimerGetTime( &start ); |
||||||
|
|
||||||
|
for (size_t i = 0; i < N; i++) |
||||||
|
{ |
||||||
|
vf32x4_t ax = vec_zero; |
||||||
|
vf32x4_t ay = vec_zero; |
||||||
|
vf32x4_t az = vec_zero; |
||||||
|
vf32x4_t *px = (vf32x4_t *) pos[0]; |
||||||
|
vf32x4_t *py = (vf32x4_t *) pos[1]; |
||||||
|
vf32x4_t *pz = (vf32x4_t *) pos[2]; |
||||||
|
vf32x4_t *pmass = (vf32x4_t *) mass; |
||||||
|
vf32x4_t x0 = _vec_set_ps1( pos[0][i] ); |
||||||
|
vf32x4_t y0 = _vec_set_ps1( pos[1][i] ); |
||||||
|
vf32x4_t z0 = _vec_set_ps1( pos[2][i] ); |
||||||
|
|
||||||
|
for ( size_t j = 0; j < N/4; j++ ) { |
||||||
|
|
||||||
|
bodyBodyInteraction( |
||||||
|
ax, ay, az, |
||||||
|
x0, y0, z0, |
||||||
|
px[j], py[j], pz[j], pmass[j], |
||||||
|
_vec_set_ps1( softeningSquared ) ); |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
// Accumulate sum of four floats in the NEON register
|
||||||
|
force[0][i] = _vec_sum( ax ); |
||||||
|
force[1][i] = _vec_sum( ay ); |
||||||
|
force[2][i] = _vec_sum( az ); |
||||||
|
} |
||||||
|
|
||||||
|
chTimerGetTime( &end ); |
||||||
|
|
||||||
|
return (float) chTimerElapsedTime( &start, &end ) * 1000.0f; |
||||||
|
} |
||||||
|
#endif |
@ -0,0 +1,93 @@ |
|||||||
|
/*
|
||||||
|
* |
||||||
|
* nbody_CPU_NEON_openmp.cpp |
||||||
|
* |
||||||
|
* Multithreaded (OpenMP) NEON CPU implementation of the O(N^2) N-body calculation. |
||||||
|
* Uses SOA (structure of arrays) representation because it is a much |
||||||
|
* better fit for NEON. |
||||||
|
* |
||||||
|
* Copyright (c) 2011-2012, Archaea Software, LLC. |
||||||
|
* All rights reserved. |
||||||
|
* |
||||||
|
* Redistribution and use in source and binary forms, with or without |
||||||
|
* modification, are permitted provided that the following conditions |
||||||
|
* are met: |
||||||
|
* |
||||||
|
* 1. Redistributions of source code must retain the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer. |
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright |
||||||
|
* notice, this list of conditions and the following disclaimer in |
||||||
|
* the documentation and/or other materials provided with the |
||||||
|
* distribution. |
||||||
|
* |
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||||||
|
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||||
|
* POSSIBILITY OF SUCH DAMAGE. |
||||||
|
* |
||||||
|
*/ |
||||||
|
|
||||||
|
#ifdef __ARM_NEON__ |
||||||
|
#ifdef _OPENMP |
||||||
|
#include <chTimer.h> |
||||||
|
|
||||||
|
#include "nbody.h" |
||||||
|
#include "bodybodyInteraction_NEON.h" |
||||||
|
#include "nbody_CPU_SIMD.h" |
||||||
|
|
||||||
|
float |
||||||
|
ComputeGravitation_SIMD_openmp( |
||||||
|
float *force[3], |
||||||
|
float *pos[4], |
||||||
|
float *mass, |
||||||
|
float softeningSquared, |
||||||
|
size_t N |
||||||
|
) |
||||||
|
{ |
||||||
|
chTimerTimestamp start, end; |
||||||
|
chTimerGetTime( &start ); |
||||||
|
|
||||||
|
#pragma omp parallel for |
||||||
|
for (size_t i = 0; i < N; i++) |
||||||
|
{ |
||||||
|
vf32x4_t ax = vec_zero; |
||||||
|
vf32x4_t ay = vec_zero; |
||||||
|
vf32x4_t az = vec_zero; |
||||||
|
vf32x4_t *px = (vf32x4_t *) pos[0]; |
||||||
|
vf32x4_t *py = (vf32x4_t *) pos[1]; |
||||||
|
vf32x4_t *pz = (vf32x4_t *) pos[2]; |
||||||
|
vf32x4_t *pmass = (vf32x4_t *) mass; |
||||||
|
vf32x4_t x0 = _vec_set_ps1( pos[0][i] ); |
||||||
|
vf32x4_t y0 = _vec_set_ps1( pos[1][i] ); |
||||||
|
vf32x4_t z0 = _vec_set_ps1( pos[2][i] ); |
||||||
|
|
||||||
|
for ( size_t j = 0; j < N/4; j++ ) { |
||||||
|
|
||||||
|
bodyBodyInteraction( |
||||||
|
ax, ay, az, |
||||||
|
x0, y0, z0, |
||||||
|
px[j], py[j], pz[j], pmass[j], |
||||||
|
_vec_set_ps1( softeningSquared ) ); |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
// Accumulate sum of four floats in the NEON register
|
||||||
|
force[0][i] = _vec_sum( ax ); |
||||||
|
force[1][i] = _vec_sum( ay ); |
||||||
|
force[2][i] = _vec_sum( az ); |
||||||
|
} |
||||||
|
|
||||||
|
chTimerGetTime( &end ); |
||||||
|
|
||||||
|
return (float) chTimerElapsedTime( &start, &end ) * 1000.0f; |
||||||
|
} |
||||||
|
#endif |
||||||
|
#endif |
Loading…
Reference in new issue