From a3f09477ba6625f692c3826e7dfd67097f802d58 Mon Sep 17 00:00:00 2001 From: Nicholas Wilt Date: Wed, 31 Oct 2012 18:37:08 -0700 Subject: [PATCH] Checkpoint warp scan code --- scan/warp/scanWarp.cuh | 89 ++++++++++++++++ scan/warp/scanWarp2.cuh | 91 ++++++++++++++++ scan/warp/testScanWarp.cu | 212 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 392 insertions(+) create mode 100644 scan/warp/scanWarp.cuh create mode 100644 scan/warp/scanWarp2.cuh create mode 100644 scan/warp/testScanWarp.cu diff --git a/scan/warp/scanWarp.cuh b/scan/warp/scanWarp.cuh new file mode 100644 index 0000000..8a3271a --- /dev/null +++ b/scan/warp/scanWarp.cuh @@ -0,0 +1,89 @@ +/* + * + * scanWarp.cuh + * + * Device functions to perform scan within a 32-thread warp. + * + * Copyright (c) 2011-2012, Archaea Software, LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef __SCAN_WARP_CUH__ +#define __SCAN_WARP_CUH__ + +#if 0 +/* + * scanWarp - assumes no zero padding + */ +template +inline __device__ T +scanWarp( volatile T *sPartials ) +{ + const int tid = threadIdx.x; + const int lane = tid & 31; + + if ( lane >= 1 ) sPartials[0] += sPartials[- 1]; + if ( lane >= 2 ) sPartials[0] += sPartials[- 2]; + if ( lane >= 4 ) sPartials[0] += sPartials[- 4]; + if ( lane >= 8 ) sPartials[0] += sPartials[- 8]; + if ( lane >= 16 ) sPartials[0] += sPartials[-16]; + return sPartials[0]; +} +#endif + +/* + * scanWarp - bZeroPadded template parameter specifies + * whether to conditionally add based on the lane ID. + * If we can assume that sPartials[-1..-16] is 0, + * the routine takes fewer instructions. + */ +template +inline __device__ T +scanWarp( volatile T *sPartials ) +{ + T t = sPartials[0]; + if ( bZeroPadded ) { + t += sPartials[- 1]; sPartials[0] = t; + t += sPartials[- 2]; sPartials[0] = t; + t += sPartials[- 4]; sPartials[0] = t; + t += sPartials[- 8]; sPartials[0] = t; + t += sPartials[-16]; sPartials[0] = t; + } + else { + const int tid = threadIdx.x; + const int lane = tid & 31; + if ( lane >= 1 ) { t += sPartials[- 1]; sPartials[0] = t; } + if ( lane >= 2 ) { t += sPartials[- 2]; sPartials[0] = t; } + if ( lane >= 4 ) { t += sPartials[- 4]; sPartials[0] = t; } + if ( lane >= 8 ) { t += sPartials[- 8]; sPartials[0] = t; } + if ( lane >= 16 ) { t += sPartials[-16]; sPartials[0] = t; } + } + return t; +} + +#endif // __SCAN_WARP_CUH__ diff --git a/scan/warp/scanWarp2.cuh b/scan/warp/scanWarp2.cuh new file mode 100644 index 0000000..c08c3cb --- /dev/null +++ b/scan/warp/scanWarp2.cuh @@ -0,0 +1,91 @@ +/* + * + * scanWarp2.cuh + * + * Alternative implementation of warp scan. + * + * Copyright (c) 2011-2012, Archaea Software, LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef __SCAN_WARP_CUH__ +#define __SCAN_WARP_CUH__ + +#if 0 +/* + * scanWarp - assumes no zero padding + */ +template +inline __device__ T +scanWarp( volatile T *sPartials ) +{ + const int tid = threadIdx.x; + const int lane = tid & 31; + + if ( lane >= 1 ) sPartials[0] += sPartials[- 1]; + if ( lane >= 2 ) sPartials[0] += sPartials[- 2]; + if ( lane >= 4 ) sPartials[0] += sPartials[- 4]; + if ( lane >= 8 ) sPartials[0] += sPartials[- 8]; + if ( lane >= 16 ) sPartials[0] += sPartials[-16]; + return sPartials[0]; +} +#endif + +/* + * scanWarp - bZeroPadded template parameter specifies + * whether to conditionally add based on the lane ID. + * If we can assume that sPartials[-1..-16] is 0, + * the routine takes fewer instructions. + * idx is the base index of the warp to scan. + */ +template +inline __device__ T +scanWarp( volatile T *sPartials ) +{ + if ( bZeroPadded ) { + T t = sPartials[0]; + sPartials[0] = t = t + sPartials[- 1]; + sPartials[0] = t = t + sPartials[- 2]; + sPartials[0] = t = t + sPartials[- 4]; + sPartials[0] = t = t + sPartials[- 8]; + sPartials[0] = t = t + sPartials[-16]; + } + else { + const int tid = threadIdx.x; + const int lane = tid & 31; + + if ( lane >= 1 ) sPartials[0] += sPartials[- 1]; + if ( lane >= 2 ) sPartials[0] += sPartials[- 2]; + if ( lane >= 4 ) sPartials[0] += sPartials[- 4]; + if ( lane >= 8 ) sPartials[0] += sPartials[- 8]; + if ( lane >= 16 ) sPartials[0] += sPartials[-16]; + } + return sPartials[0]; +} + +#endif // __SCAN_WARP_CUH__ diff --git a/scan/warp/testScanWarp.cu b/scan/warp/testScanWarp.cu new file mode 100644 index 0000000..ee47b4c --- /dev/null +++ b/scan/warp/testScanWarp.cu @@ -0,0 +1,212 @@ +/* + * + * testScan.cu + * + * Microdemo to test scan algorithms. + * + * Build with: nvcc -I ..\chLib testScan.cu + * Requires: No minimum SM requirement. + * + * Copyright (c) 2011-2012, Archaea Software, LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#include +#include + +#include +#include +#include + +#include "scanWarp.cuh" +#include "scanBlock.cuh" + +#include "scanZeroPad.cuh" + +#define min(a,b) ((a)<(b)?(a):(b)) + +int *g_hostIn, *g_hostOut; + + +enum ScanType { + Inclusive, Exclusive +}; + +#include "scanFan.cuh" +#include "scanReduceThenScan.cuh" +#include "scanReduceThenScan_0.cuh" +#include "scan2Level.cuh" +#include "ScanThrust.cuh" + +void +ScanExclusiveCPU( int *out, const int *in, size_t N ) +{ + int sum = 0; + for ( size_t i = 0; i < N; i++ ) { + int next = in[i]; // in case we are doing this in place + out[i] = sum; + sum += next; + } +} + +int +ScanInclusiveCPU( int *out, const int *in, size_t N ) +{ + int sum = 0; + for ( size_t i = 0; i < N; i++ ) { + sum += in[i]; + out[i] = sum; + } + return sum; +} + +void +RandomArray( int *out, size_t N, int modulus ) +{ + for ( size_t i = 0; i < N; i++ ) { + out[i] = rand() % modulus; + } +} + +template +bool +TestScan( const char *szScanFunction, + void (*pfnScanGPU)(T *, const T *, size_t, int), + size_t N, + int numThreads ) +{ + bool ret = false; + cudaError_t status; + int *inGPU = 0; + int *outGPU = 0; + int *inCPU = (T *) malloc( N*sizeof(T) ); + int *outCPU = (int *) malloc( N*sizeof(T) ); + int *hostGPU = (int *) malloc( N*sizeof(T) ); + if ( 0==inCPU || 0==outCPU || 0==hostGPU ) + goto Error; + + printf( "Testing %s (%d integers, %d threads/block)\n", + szScanFunction, + (int) N, + numThreads ); + + CUDART_CHECK( cudaMalloc( &inGPU, N*sizeof(T) ) ); + CUDART_CHECK( cudaMalloc( &outGPU, N*sizeof(T) ) ); + CUDART_CHECK( cudaMemset( inGPU, 0, N*sizeof(T) ) ); + CUDART_CHECK( cudaMemset( outGPU, 0, N*sizeof(T) ) ); + + CUDART_CHECK( cudaMemset( outGPU, 0, N*sizeof(T) ) ); + + RandomArray( inCPU, N, 256 ); +for ( int i = 0; i < N; i++ ) { + inCPU[i] = i; +} + + ScanInclusiveCPU( outCPU, inCPU, N ); +g_hostIn = inCPU; + + CUDART_CHECK( cudaMemcpy( inGPU, inCPU, N*sizeof(T), cudaMemcpyHostToDevice ) ); + pfnScanGPU( outGPU, inGPU, N, numThreads ); + CUDART_CHECK( cudaMemcpy( hostGPU, outGPU, N*sizeof(T), cudaMemcpyDeviceToHost ) ); + for ( size_t i = 0; i < N; i++ ) { + if ( hostGPU[i] != outCPU[i] ) { + printf( "Scan failed\n" ); +#ifdef _WIN32 + _asm int 3 +#else + assert(0); +#endif + goto Error; + } + } + ret = true; +Error: + cudaFree( outGPU ); + cudaFree( inGPU ); + free( inCPU ); + free( outCPU ); + free( hostGPU ); + return ret; +} + +int +main( int argc, char *argv[] ) +{ + cudaError_t status; + int maxThreads; + + CUDART_CHECK( cudaSetDevice( 0 ) ); + CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) ); + + { + cudaDeviceProp prop; + cudaGetDeviceProperties( &prop, 0 ); + maxThreads = prop.maxThreadsPerBlock; + } + +#define SCAN_TEST_VECTOR( Function, N, numThreads ) do { \ + srand(0); \ + bool bSuccess = TestScan( #Function, Function, N, numThreads ); \ + if ( ! bSuccess ) { \ + printf( "%s failed: N=%d, numThreads=%d\n", #Function, N, numThreads ); \ + exit(1); \ + } \ +} while (0) + + for ( int numThreads = 256; numThreads <= maxThreads; numThreads *= 2 ) { + + for ( int numInts = 256; numInts <= 2048; numInts += 128 ) { + + SCAN_TEST_VECTOR( scan2Level, numInts, numThreads ); + + SCAN_TEST_VECTOR( scanFan, numInts, numThreads ); + SCAN_TEST_VECTOR( scanReduceThenScan, numInts, numThreads ); + SCAN_TEST_VECTOR( scanReduceThenScan_0, numInts, numThreads ); + SCAN_TEST_VECTOR( scan2Level, numInts, numThreads ); + SCAN_TEST_VECTOR( scan2Level_0, numInts, numThreads ); + } + + for ( int numInts = 33*1048576-1; numInts < 33*1048576+1; numInts++ ) { + + SCAN_TEST_VECTOR( scan2Level, numInts, numThreads ); + SCAN_TEST_VECTOR( scan2Level_0, numInts, numThreads ); + + SCAN_TEST_VECTOR( scanFan, numInts, numThreads ); + SCAN_TEST_VECTOR( scanReduceThenScan, numInts, numThreads ); + SCAN_TEST_VECTOR( scanReduceThenScan_0, numInts, numThreads ); + + SCAN_TEST_VECTOR( ScanThrust, numInts, numThreads ); + } + + } + return 0; +Error: + return 1; +}