3 changed files with 392 additions and 0 deletions
@ -0,0 +1,89 @@
@@ -0,0 +1,89 @@
|
||||
/* |
||||
* |
||||
* scanWarp.cuh |
||||
* |
||||
* Device functions to perform scan within a 32-thread warp. |
||||
* |
||||
* Copyright (c) 2011-2012, Archaea Software, LLC. |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in |
||||
* the documentation and/or other materials provided with the |
||||
* distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||
* POSSIBILITY OF SUCH DAMAGE. |
||||
* |
||||
*/ |
||||
|
||||
#ifndef __SCAN_WARP_CUH__ |
||||
#define __SCAN_WARP_CUH__ |
||||
|
||||
#if 0 |
||||
/* |
||||
* scanWarp - assumes no zero padding |
||||
*/ |
||||
template<class T> |
||||
inline __device__ T |
||||
scanWarp( volatile T *sPartials ) |
||||
{ |
||||
const int tid = threadIdx.x; |
||||
const int lane = tid & 31; |
||||
|
||||
if ( lane >= 1 ) sPartials[0] += sPartials[- 1]; |
||||
if ( lane >= 2 ) sPartials[0] += sPartials[- 2]; |
||||
if ( lane >= 4 ) sPartials[0] += sPartials[- 4]; |
||||
if ( lane >= 8 ) sPartials[0] += sPartials[- 8]; |
||||
if ( lane >= 16 ) sPartials[0] += sPartials[-16]; |
||||
return sPartials[0]; |
||||
} |
||||
#endif |
||||
|
||||
/* |
||||
* scanWarp - bZeroPadded template parameter specifies |
||||
* whether to conditionally add based on the lane ID. |
||||
* If we can assume that sPartials[-1..-16] is 0, |
||||
* the routine takes fewer instructions. |
||||
*/ |
||||
template<class T, bool bZeroPadded> |
||||
inline __device__ T |
||||
scanWarp( volatile T *sPartials ) |
||||
{ |
||||
T t = sPartials[0]; |
||||
if ( bZeroPadded ) { |
||||
t += sPartials[- 1]; sPartials[0] = t; |
||||
t += sPartials[- 2]; sPartials[0] = t; |
||||
t += sPartials[- 4]; sPartials[0] = t; |
||||
t += sPartials[- 8]; sPartials[0] = t; |
||||
t += sPartials[-16]; sPartials[0] = t; |
||||
} |
||||
else { |
||||
const int tid = threadIdx.x; |
||||
const int lane = tid & 31; |
||||
if ( lane >= 1 ) { t += sPartials[- 1]; sPartials[0] = t; } |
||||
if ( lane >= 2 ) { t += sPartials[- 2]; sPartials[0] = t; } |
||||
if ( lane >= 4 ) { t += sPartials[- 4]; sPartials[0] = t; } |
||||
if ( lane >= 8 ) { t += sPartials[- 8]; sPartials[0] = t; } |
||||
if ( lane >= 16 ) { t += sPartials[-16]; sPartials[0] = t; } |
||||
} |
||||
return t; |
||||
} |
||||
|
||||
#endif // __SCAN_WARP_CUH__ |
@ -0,0 +1,91 @@
@@ -0,0 +1,91 @@
|
||||
/* |
||||
* |
||||
* scanWarp2.cuh |
||||
* |
||||
* Alternative implementation of warp scan. |
||||
* |
||||
* Copyright (c) 2011-2012, Archaea Software, LLC. |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in |
||||
* the documentation and/or other materials provided with the |
||||
* distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||
* POSSIBILITY OF SUCH DAMAGE. |
||||
* |
||||
*/ |
||||
|
||||
#ifndef __SCAN_WARP_CUH__ |
||||
#define __SCAN_WARP_CUH__ |
||||
|
||||
#if 0 |
||||
/* |
||||
* scanWarp - assumes no zero padding |
||||
*/ |
||||
template<class T> |
||||
inline __device__ T |
||||
scanWarp( volatile T *sPartials ) |
||||
{ |
||||
const int tid = threadIdx.x; |
||||
const int lane = tid & 31; |
||||
|
||||
if ( lane >= 1 ) sPartials[0] += sPartials[- 1]; |
||||
if ( lane >= 2 ) sPartials[0] += sPartials[- 2]; |
||||
if ( lane >= 4 ) sPartials[0] += sPartials[- 4]; |
||||
if ( lane >= 8 ) sPartials[0] += sPartials[- 8]; |
||||
if ( lane >= 16 ) sPartials[0] += sPartials[-16]; |
||||
return sPartials[0]; |
||||
} |
||||
#endif |
||||
|
||||
/* |
||||
* scanWarp - bZeroPadded template parameter specifies |
||||
* whether to conditionally add based on the lane ID. |
||||
* If we can assume that sPartials[-1..-16] is 0, |
||||
* the routine takes fewer instructions. |
||||
* idx is the base index of the warp to scan. |
||||
*/ |
||||
template<class T, bool bZeroPadded> |
||||
inline __device__ T |
||||
scanWarp( volatile T *sPartials ) |
||||
{ |
||||
if ( bZeroPadded ) { |
||||
T t = sPartials[0]; |
||||
sPartials[0] = t = t + sPartials[- 1]; |
||||
sPartials[0] = t = t + sPartials[- 2]; |
||||
sPartials[0] = t = t + sPartials[- 4]; |
||||
sPartials[0] = t = t + sPartials[- 8]; |
||||
sPartials[0] = t = t + sPartials[-16]; |
||||
} |
||||
else { |
||||
const int tid = threadIdx.x; |
||||
const int lane = tid & 31; |
||||
|
||||
if ( lane >= 1 ) sPartials[0] += sPartials[- 1]; |
||||
if ( lane >= 2 ) sPartials[0] += sPartials[- 2]; |
||||
if ( lane >= 4 ) sPartials[0] += sPartials[- 4]; |
||||
if ( lane >= 8 ) sPartials[0] += sPartials[- 8]; |
||||
if ( lane >= 16 ) sPartials[0] += sPartials[-16]; |
||||
} |
||||
return sPartials[0]; |
||||
} |
||||
|
||||
#endif // __SCAN_WARP_CUH__ |
@ -0,0 +1,212 @@
@@ -0,0 +1,212 @@
|
||||
/* |
||||
* |
||||
* testScan.cu |
||||
* |
||||
* Microdemo to test scan algorithms. |
||||
* |
||||
* Build with: nvcc -I ..\chLib <options> testScan.cu |
||||
* Requires: No minimum SM requirement. |
||||
* |
||||
* Copyright (c) 2011-2012, Archaea Software, LLC. |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in |
||||
* the documentation and/or other materials provided with the |
||||
* distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||
* POSSIBILITY OF SUCH DAMAGE. |
||||
* |
||||
*/ |
||||
|
||||
#include <stdlib.h> |
||||
|
||||
#include <thrust/host_vector.h> |
||||
#include <thrust/device_vector.h> |
||||
|
||||
#include <chTimer.h> |
||||
#include <chAssert.h> |
||||
#include <chError.h> |
||||
|
||||
#include "scanWarp.cuh" |
||||
#include "scanBlock.cuh" |
||||
|
||||
#include "scanZeroPad.cuh" |
||||
|
||||
#define min(a,b) ((a)<(b)?(a):(b)) |
||||
|
||||
int *g_hostIn, *g_hostOut; |
||||
|
||||
|
||||
enum ScanType { |
||||
Inclusive, Exclusive |
||||
}; |
||||
|
||||
#include "scanFan.cuh" |
||||
#include "scanReduceThenScan.cuh" |
||||
#include "scanReduceThenScan_0.cuh" |
||||
#include "scan2Level.cuh" |
||||
#include "ScanThrust.cuh" |
||||
|
||||
void |
||||
ScanExclusiveCPU( int *out, const int *in, size_t N ) |
||||
{ |
||||
int sum = 0; |
||||
for ( size_t i = 0; i < N; i++ ) { |
||||
int next = in[i]; // in case we are doing this in place |
||||
out[i] = sum; |
||||
sum += next; |
||||
} |
||||
} |
||||
|
||||
int |
||||
ScanInclusiveCPU( int *out, const int *in, size_t N ) |
||||
{ |
||||
int sum = 0; |
||||
for ( size_t i = 0; i < N; i++ ) { |
||||
sum += in[i]; |
||||
out[i] = sum; |
||||
} |
||||
return sum; |
||||
} |
||||
|
||||
void |
||||
RandomArray( int *out, size_t N, int modulus ) |
||||
{ |
||||
for ( size_t i = 0; i < N; i++ ) { |
||||
out[i] = rand() % modulus; |
||||
} |
||||
} |
||||
|
||||
template<class T> |
||||
bool |
||||
TestScan( const char *szScanFunction, |
||||
void (*pfnScanGPU)(T *, const T *, size_t, int), |
||||
size_t N, |
||||
int numThreads ) |
||||
{ |
||||
bool ret = false; |
||||
cudaError_t status; |
||||
int *inGPU = 0; |
||||
int *outGPU = 0; |
||||
int *inCPU = (T *) malloc( N*sizeof(T) ); |
||||
int *outCPU = (int *) malloc( N*sizeof(T) ); |
||||
int *hostGPU = (int *) malloc( N*sizeof(T) ); |
||||
if ( 0==inCPU || 0==outCPU || 0==hostGPU ) |
||||
goto Error; |
||||
|
||||
printf( "Testing %s (%d integers, %d threads/block)\n", |
||||
szScanFunction, |
||||
(int) N, |
||||
numThreads ); |
||||
|
||||
CUDART_CHECK( cudaMalloc( &inGPU, N*sizeof(T) ) ); |
||||
CUDART_CHECK( cudaMalloc( &outGPU, N*sizeof(T) ) ); |
||||
CUDART_CHECK( cudaMemset( inGPU, 0, N*sizeof(T) ) ); |
||||
CUDART_CHECK( cudaMemset( outGPU, 0, N*sizeof(T) ) ); |
||||
|
||||
CUDART_CHECK( cudaMemset( outGPU, 0, N*sizeof(T) ) ); |
||||
|
||||
RandomArray( inCPU, N, 256 ); |
||||
for ( int i = 0; i < N; i++ ) { |
||||
inCPU[i] = i; |
||||
} |
||||
|
||||
ScanInclusiveCPU( outCPU, inCPU, N ); |
||||
g_hostIn = inCPU; |
||||
|
||||
CUDART_CHECK( cudaMemcpy( inGPU, inCPU, N*sizeof(T), cudaMemcpyHostToDevice ) ); |
||||
pfnScanGPU( outGPU, inGPU, N, numThreads ); |
||||
CUDART_CHECK( cudaMemcpy( hostGPU, outGPU, N*sizeof(T), cudaMemcpyDeviceToHost ) ); |
||||
for ( size_t i = 0; i < N; i++ ) { |
||||
if ( hostGPU[i] != outCPU[i] ) { |
||||
printf( "Scan failed\n" ); |
||||
#ifdef _WIN32 |
||||
_asm int 3 |
||||
#else |
||||
assert(0); |
||||
#endif |
||||
goto Error; |
||||
} |
||||
} |
||||
ret = true; |
||||
Error: |
||||
cudaFree( outGPU ); |
||||
cudaFree( inGPU ); |
||||
free( inCPU ); |
||||
free( outCPU ); |
||||
free( hostGPU ); |
||||
return ret; |
||||
} |
||||
|
||||
int |
||||
main( int argc, char *argv[] ) |
||||
{ |
||||
cudaError_t status; |
||||
int maxThreads; |
||||
|
||||
CUDART_CHECK( cudaSetDevice( 0 ) ); |
||||
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) ); |
||||
|
||||
{ |
||||
cudaDeviceProp prop; |
||||
cudaGetDeviceProperties( &prop, 0 ); |
||||
maxThreads = prop.maxThreadsPerBlock; |
||||
} |
||||
|
||||
#define SCAN_TEST_VECTOR( Function, N, numThreads ) do { \ |
||||
srand(0); \ |
||||
bool bSuccess = TestScan<int>( #Function, Function, N, numThreads ); \ |
||||
if ( ! bSuccess ) { \ |
||||
printf( "%s failed: N=%d, numThreads=%d\n", #Function, N, numThreads ); \ |
||||
exit(1); \ |
||||
} \ |
||||
} while (0) |
||||
|
||||
for ( int numThreads = 256; numThreads <= maxThreads; numThreads *= 2 ) { |
||||
|
||||
for ( int numInts = 256; numInts <= 2048; numInts += 128 ) { |
||||
|
||||
SCAN_TEST_VECTOR( scan2Level<int>, numInts, numThreads ); |
||||
|
||||
SCAN_TEST_VECTOR( scanFan<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scanReduceThenScan<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scanReduceThenScan_0<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scan2Level<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scan2Level_0<int>, numInts, numThreads ); |
||||
} |
||||
|
||||
for ( int numInts = 33*1048576-1; numInts < 33*1048576+1; numInts++ ) { |
||||
|
||||
SCAN_TEST_VECTOR( scan2Level<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scan2Level_0<int>, numInts, numThreads ); |
||||
|
||||
SCAN_TEST_VECTOR( scanFan<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scanReduceThenScan<int>, numInts, numThreads ); |
||||
SCAN_TEST_VECTOR( scanReduceThenScan_0<int>, numInts, numThreads ); |
||||
|
||||
SCAN_TEST_VECTOR( ScanThrust<int>, numInts, numThreads ); |
||||
} |
||||
|
||||
} |
||||
return 0; |
||||
Error: |
||||
return 1; |
||||
} |
Loading…
Reference in new issue