Browse Source

Checkpoint warp scan code

pull/1/head
Nicholas Wilt 13 years ago
parent
commit
a3f09477ba
  1. 89
      scan/warp/scanWarp.cuh
  2. 91
      scan/warp/scanWarp2.cuh
  3. 212
      scan/warp/testScanWarp.cu

89
scan/warp/scanWarp.cuh

@ -0,0 +1,89 @@ @@ -0,0 +1,89 @@
/*
*
* scanWarp.cuh
*
* Device functions to perform scan within a 32-thread warp.
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef __SCAN_WARP_CUH__
#define __SCAN_WARP_CUH__
#if 0
/*
* scanWarp - assumes no zero padding
*/
template<class T>
inline __device__ T
scanWarp( volatile T *sPartials )
{
const int tid = threadIdx.x;
const int lane = tid & 31;
if ( lane >= 1 ) sPartials[0] += sPartials[- 1];
if ( lane >= 2 ) sPartials[0] += sPartials[- 2];
if ( lane >= 4 ) sPartials[0] += sPartials[- 4];
if ( lane >= 8 ) sPartials[0] += sPartials[- 8];
if ( lane >= 16 ) sPartials[0] += sPartials[-16];
return sPartials[0];
}
#endif
/*
* scanWarp - bZeroPadded template parameter specifies
* whether to conditionally add based on the lane ID.
* If we can assume that sPartials[-1..-16] is 0,
* the routine takes fewer instructions.
*/
template<class T, bool bZeroPadded>
inline __device__ T
scanWarp( volatile T *sPartials )
{
T t = sPartials[0];
if ( bZeroPadded ) {
t += sPartials[- 1]; sPartials[0] = t;
t += sPartials[- 2]; sPartials[0] = t;
t += sPartials[- 4]; sPartials[0] = t;
t += sPartials[- 8]; sPartials[0] = t;
t += sPartials[-16]; sPartials[0] = t;
}
else {
const int tid = threadIdx.x;
const int lane = tid & 31;
if ( lane >= 1 ) { t += sPartials[- 1]; sPartials[0] = t; }
if ( lane >= 2 ) { t += sPartials[- 2]; sPartials[0] = t; }
if ( lane >= 4 ) { t += sPartials[- 4]; sPartials[0] = t; }
if ( lane >= 8 ) { t += sPartials[- 8]; sPartials[0] = t; }
if ( lane >= 16 ) { t += sPartials[-16]; sPartials[0] = t; }
}
return t;
}
#endif // __SCAN_WARP_CUH__

91
scan/warp/scanWarp2.cuh

@ -0,0 +1,91 @@ @@ -0,0 +1,91 @@
/*
*
* scanWarp2.cuh
*
* Alternative implementation of warp scan.
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef __SCAN_WARP_CUH__
#define __SCAN_WARP_CUH__
#if 0
/*
* scanWarp - assumes no zero padding
*/
template<class T>
inline __device__ T
scanWarp( volatile T *sPartials )
{
const int tid = threadIdx.x;
const int lane = tid & 31;
if ( lane >= 1 ) sPartials[0] += sPartials[- 1];
if ( lane >= 2 ) sPartials[0] += sPartials[- 2];
if ( lane >= 4 ) sPartials[0] += sPartials[- 4];
if ( lane >= 8 ) sPartials[0] += sPartials[- 8];
if ( lane >= 16 ) sPartials[0] += sPartials[-16];
return sPartials[0];
}
#endif
/*
* scanWarp - bZeroPadded template parameter specifies
* whether to conditionally add based on the lane ID.
* If we can assume that sPartials[-1..-16] is 0,
* the routine takes fewer instructions.
* idx is the base index of the warp to scan.
*/
template<class T, bool bZeroPadded>
inline __device__ T
scanWarp( volatile T *sPartials )
{
if ( bZeroPadded ) {
T t = sPartials[0];
sPartials[0] = t = t + sPartials[- 1];
sPartials[0] = t = t + sPartials[- 2];
sPartials[0] = t = t + sPartials[- 4];
sPartials[0] = t = t + sPartials[- 8];
sPartials[0] = t = t + sPartials[-16];
}
else {
const int tid = threadIdx.x;
const int lane = tid & 31;
if ( lane >= 1 ) sPartials[0] += sPartials[- 1];
if ( lane >= 2 ) sPartials[0] += sPartials[- 2];
if ( lane >= 4 ) sPartials[0] += sPartials[- 4];
if ( lane >= 8 ) sPartials[0] += sPartials[- 8];
if ( lane >= 16 ) sPartials[0] += sPartials[-16];
}
return sPartials[0];
}
#endif // __SCAN_WARP_CUH__

212
scan/warp/testScanWarp.cu

@ -0,0 +1,212 @@ @@ -0,0 +1,212 @@
/*
*
* testScan.cu
*
* Microdemo to test scan algorithms.
*
* Build with: nvcc -I ..\chLib <options> testScan.cu
* Requires: No minimum SM requirement.
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <chTimer.h>
#include <chAssert.h>
#include <chError.h>
#include "scanWarp.cuh"
#include "scanBlock.cuh"
#include "scanZeroPad.cuh"
#define min(a,b) ((a)<(b)?(a):(b))
int *g_hostIn, *g_hostOut;
enum ScanType {
Inclusive, Exclusive
};
#include "scanFan.cuh"
#include "scanReduceThenScan.cuh"
#include "scanReduceThenScan_0.cuh"
#include "scan2Level.cuh"
#include "ScanThrust.cuh"
void
ScanExclusiveCPU( int *out, const int *in, size_t N )
{
int sum = 0;
for ( size_t i = 0; i < N; i++ ) {
int next = in[i]; // in case we are doing this in place
out[i] = sum;
sum += next;
}
}
int
ScanInclusiveCPU( int *out, const int *in, size_t N )
{
int sum = 0;
for ( size_t i = 0; i < N; i++ ) {
sum += in[i];
out[i] = sum;
}
return sum;
}
void
RandomArray( int *out, size_t N, int modulus )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] = rand() % modulus;
}
}
template<class T>
bool
TestScan( const char *szScanFunction,
void (*pfnScanGPU)(T *, const T *, size_t, int),
size_t N,
int numThreads )
{
bool ret = false;
cudaError_t status;
int *inGPU = 0;
int *outGPU = 0;
int *inCPU = (T *) malloc( N*sizeof(T) );
int *outCPU = (int *) malloc( N*sizeof(T) );
int *hostGPU = (int *) malloc( N*sizeof(T) );
if ( 0==inCPU || 0==outCPU || 0==hostGPU )
goto Error;
printf( "Testing %s (%d integers, %d threads/block)\n",
szScanFunction,
(int) N,
numThreads );
CUDART_CHECK( cudaMalloc( &inGPU, N*sizeof(T) ) );
CUDART_CHECK( cudaMalloc( &outGPU, N*sizeof(T) ) );
CUDART_CHECK( cudaMemset( inGPU, 0, N*sizeof(T) ) );
CUDART_CHECK( cudaMemset( outGPU, 0, N*sizeof(T) ) );
CUDART_CHECK( cudaMemset( outGPU, 0, N*sizeof(T) ) );
RandomArray( inCPU, N, 256 );
for ( int i = 0; i < N; i++ ) {
inCPU[i] = i;
}
ScanInclusiveCPU( outCPU, inCPU, N );
g_hostIn = inCPU;
CUDART_CHECK( cudaMemcpy( inGPU, inCPU, N*sizeof(T), cudaMemcpyHostToDevice ) );
pfnScanGPU( outGPU, inGPU, N, numThreads );
CUDART_CHECK( cudaMemcpy( hostGPU, outGPU, N*sizeof(T), cudaMemcpyDeviceToHost ) );
for ( size_t i = 0; i < N; i++ ) {
if ( hostGPU[i] != outCPU[i] ) {
printf( "Scan failed\n" );
#ifdef _WIN32
_asm int 3
#else
assert(0);
#endif
goto Error;
}
}
ret = true;
Error:
cudaFree( outGPU );
cudaFree( inGPU );
free( inCPU );
free( outCPU );
free( hostGPU );
return ret;
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int maxThreads;
CUDART_CHECK( cudaSetDevice( 0 ) );
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
cudaDeviceProp prop;
cudaGetDeviceProperties( &prop, 0 );
maxThreads = prop.maxThreadsPerBlock;
}
#define SCAN_TEST_VECTOR( Function, N, numThreads ) do { \
srand(0); \
bool bSuccess = TestScan<int>( #Function, Function, N, numThreads ); \
if ( ! bSuccess ) { \
printf( "%s failed: N=%d, numThreads=%d\n", #Function, N, numThreads ); \
exit(1); \
} \
} while (0)
for ( int numThreads = 256; numThreads <= maxThreads; numThreads *= 2 ) {
for ( int numInts = 256; numInts <= 2048; numInts += 128 ) {
SCAN_TEST_VECTOR( scan2Level<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scanFan<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scanReduceThenScan<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scanReduceThenScan_0<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scan2Level<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scan2Level_0<int>, numInts, numThreads );
}
for ( int numInts = 33*1048576-1; numInts < 33*1048576+1; numInts++ ) {
SCAN_TEST_VECTOR( scan2Level<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scan2Level_0<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scanFan<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scanReduceThenScan<int>, numInts, numThreads );
SCAN_TEST_VECTOR( scanReduceThenScan_0<int>, numInts, numThreads );
SCAN_TEST_VECTOR( ScanThrust<int>, numInts, numThreads );
}
}
return 0;
Error:
return 1;
}
Loading…
Cancel
Save