Fix bit rot, update to use modern idioms

4 years ago · f9a5490496
56 changed files with 175 additions and 126 deletions
--- a/SMs/divergence.cu
+++ b/SMs/divergence.cu
@@ -4,7 +4,7 @@
 *
 * Microdemo to measure performance implications of conditional code.
 *
- * Build with: nvcc [--gpu-architecture sm_xx] [-D USE_FLOAT] [-D USE_IF_STATEMENT] divergence.cu
+ * Build with: nvcc [--gpu-architecture sm_xx] divergence.cu
 * Requires: No minimum SM requirement.
 *
 * Copyright (c) 2021, Archaea Software, LLC.
@@ -46,66 +46,79 @@
 // apply blockDim and/or gridDim to n before passing in.
 //
 template<int base>
-void __device__ sumFloats( float *p, size_t N, size_t n )
+void __device__ sumInts( uint32_t *p, size_t N, size_t n )
 {
    float f = base;
    for ( size_t i = 0; i < N; i++ ) {
-        *p += f;
+        *p += base;
        p += n;
    }
 }
-typedef void(*psumFloats)(float *, size_t, size_t);
+typedef void(*psumInts)(uint32_t *, size_t, size_t);
-__device__ psumFloats rgSumFloats[] = {
+__device__ psumInts rgSumInts[] = {
-    sumFloats< 0>, sumFloats< 1>, sumFloats< 2>, sumFloats< 3>,
+    sumInts< 0>, sumInts< 1>, sumInts< 2>, sumInts< 3>,
-    sumFloats< 4>, sumFloats< 5>, sumFloats< 6>, sumFloats< 7>,
+    sumInts< 4>, sumInts< 5>, sumInts< 6>, sumInts< 7>,
-    sumFloats< 8>, sumFloats< 9>, sumFloats<10>, sumFloats<11>,
+    sumInts< 8>, sumInts< 9>, sumInts<10>, sumInts<11>,
-    sumFloats<12>, sumFloats<13>, sumFloats<14>, sumFloats<15>,
+    sumInts<12>, sumInts<13>, sumInts<14>, sumInts<15>,
-    sumFloats<16>, sumFloats<17>, sumFloats<18>, sumFloats<19>,
+    sumInts<16>, sumInts<17>, sumInts<18>, sumInts<19>,
-    sumFloats<20>, sumFloats<21>, sumFloats<22>, sumFloats<23>,
+    sumInts<20>, sumInts<21>, sumInts<22>, sumInts<23>,
-    sumFloats<24>, sumFloats<25>, sumFloats<26>, sumFloats<27>,
+    sumInts<24>, sumInts<25>, sumInts<26>, sumInts<27>,
-    sumFloats<28>, sumFloats<29>, sumFloats<30>, sumFloats<31> };
+    sumInts<28>, sumInts<29>, sumInts<30>, sumInts<31> };
 template<uint32_t sh>
 __global__ void
-sumFloats_bywarp( float *p, size_t N )
+sumInts_bythread( uint32_t *p, size_t N )
 {
-    uint32_t warpid = threadIdx.x>>5;
+    uint32_t warpish_id = threadIdx.x>>sh;
    N /= blockDim.x*gridDim.x;
-    rgSumFloats[warpid]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
+    rgSumInts[warpish_id&31]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
 }
-__global__ void
+template<uint32_t sh>
-sumFloats_bythread( float *p, size_t N )
+static double
-{
+timeByThreads( uint32_t *p, size_t N )
 }
 int
 main()
 {
    cudaError_t status;
-    size_t N = 1024*1024*1024UL;
+    float elapsed_time;
-    float *p = 0;
+    double ret = 0.0;
    float et;
    cudaEvent_t start = 0, stop = 0;
    cuda(Malloc( (void **) &p, N*sizeof(float)) );
    cuda(Memset( p, 0, N*sizeof(float)) );
    cuda(EventCreate( &start ));
    cuda(EventCreate( &stop ));
    cuda(EventRecord( start ));
-    sumFloats_bywarp<<<3072,256>>>( p, N );
+    sumInts_bythread<sh><<<3072,1024>>>( p, N );
    cuda(EventRecord( stop ));
    cuda(DeviceSynchronize());
-    cuda(EventElapsedTime( &et, start, stop ));
+    cuda(EventElapsedTime( &elapsed_time, start, stop ));
-    
+    ret = N*1000.0/elapsed_time/1e9;
-    printf( "%.2f ms = %.2f Gops/s\n", et, (double) N*1000.0/et/1e9 );
+    printf( "%2d threads: %f Gops/s\n", 1<<sh, ret );
-
+Error:
    cudaFree( p );
    cudaEventDestroy( stop );
    cudaEventDestroy( start );
    return ret;
 }
 int
 main()
 {
    cudaError_t status;
    size_t N = 1024*1024*1024UL;
    uint32_t *p = 0;
    cuda(Malloc( (void **) &p, N*sizeof(uint32_t)) );
    cuda(Memset( p, 0, N*sizeof(uint32_t)) );
    timeByThreads<6>( p, N );
    timeByThreads<5>( p, N );
    timeByThreads<4>( p, N );
    timeByThreads<3>( p, N );
    timeByThreads<2>( p, N );
    timeByThreads<1>( p, N );
    timeByThreads<0>( p, N );
    cudaFree( p );
    return 0;
 Error:
    return 1;
--- a/SMs/testShuffle.cu
+++ b/SMs/testShuffle.cu
@@ -48,7 +48,7 @@ TestShuffle( int *out, const int *in, size_t N  )
    size_t i = blockIdx.x*blockDim.x+threadIdx.x;
    int value = (int) i;//in[i];
-    out[i] = __shfl_up( value, 1 );
+    out[i] = __shfl_up_sync( 0xffffffff, value, 1 );
 }
 cudaError_t
--- a/chLib/chError.h
+++ b/chLib/chError.h
@@ -112,6 +112,40 @@ chGetErrorString( CUresult status )
        ErrorValue(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
        ErrorValue(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED)
 #endif
        ErrorValue(CUDA_ERROR_STUB_LIBRARY)
        ErrorValue(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED)
        ErrorValue(CUDA_ERROR_DEVICE_NOT_LICENSED)
        ErrorValue(CUDA_ERROR_INVALID_PTX)
        ErrorValue(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT)
        ErrorValue(CUDA_ERROR_NVLINK_UNCORRECTABLE)
        ErrorValue(CUDA_ERROR_JIT_COMPILER_NOT_FOUND)
        ErrorValue(CUDA_ERROR_JIT_COMPILATION_DISABLED)
        ErrorValue(CUDA_ERROR_UNSUPPORTED_PTX_VERSION)
        ErrorValue(CUDA_ERROR_ILLEGAL_STATE)
        ErrorValue(CUDA_ERROR_ILLEGAL_ADDRESS)
        ErrorValue(CUDA_ERROR_HARDWARE_STACK_ERROR)
        ErrorValue(CUDA_ERROR_ILLEGAL_INSTRUCTION)
        ErrorValue(CUDA_ERROR_MISALIGNED_ADDRESS)
        ErrorValue(CUDA_ERROR_INVALID_ADDRESS_SPACE)
        ErrorValue(CUDA_ERROR_INVALID_PC)
        ErrorValue(CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
        ErrorValue(CUDA_ERROR_NOT_PERMITTED)
        ErrorValue(CUDA_ERROR_NOT_SUPPORTED)
        ErrorValue(CUDA_ERROR_SYSTEM_NOT_READY)
        ErrorValue(CUDA_ERROR_SYSTEM_DRIVER_MISMATCH)
        ErrorValue(CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_INVALIDATED)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_MERGE)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNMATCHED)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNJOINED)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_ISOLATION)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_IMPLICIT)
        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD)
        ErrorValue(CUDA_ERROR_TIMEOUT)
        ErrorValue(CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE)
        ErrorValue(CUDA_ERROR_CAPTURED_EVENT)
        ErrorValue(CUDA_ERROR_UNKNOWN)
    }
    return "chGetErrorString - unknown error value";
--- a/concurrency/breakevenDtoHMemcpy.cu
+++ b/concurrency/breakevenDtoHMemcpy.cu
@@ -70,7 +70,7 @@ main( int argc, char *argv[] )
            cuda(MemcpyAsync( hostInt, deviceInt, byteCount, 
                cudaMemcpyDeviceToHost, NULL ) );
        }
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
        chTimerGetTime( &stop );
        {
--- a/concurrency/breakevenHtoDMemcpy.cu
+++ b/concurrency/breakevenHtoDMemcpy.cu
@@ -70,7 +70,7 @@ main( int argc, char *argv[] )
            cuda(MemcpyAsync( deviceInt, hostInt, byteCount, 
                cudaMemcpyHostToDevice, NULL ) );
        }
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
        chTimerGetTime( &stop );
        {
--- a/concurrency/breakevenKernelAsync.cu
+++ b/concurrency/breakevenKernelAsync.cu
@@ -67,7 +67,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        WaitKernel<<<1,1>>>( 0, false );
    }
-    cudaThreadSynchronize();
+    cudaDeviceSynchronize();
    printf("Cycles\tus\n" );
    for ( int cycles = 0; cycles < 2500; cycles += 100 ) {
@@ -76,7 +76,7 @@ main( int argc, char *argv[] )
        for ( int i = 0; i < cIterations; i++ ) {
            WaitKernel<<<1,1>>>( cycles, false );
        }
-        cudaThreadSynchronize();
+        cudaDeviceSynchronize();
        chTimerGetTime( &stop );
        double microseconds = 1e6*chTimerElapsedTime( &start, &stop );
        double usPerLaunch = microseconds / (float) cIterations;
--- a/concurrency/eventRecord.cu
+++ b/concurrency/eventRecord.cu
@@ -72,7 +72,7 @@ usPerLaunch( int cIterations, int cEvents )
            cuda(EventRecord( events[j], NULL ) );
        }
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    microseconds = 1e6*chTimerElapsedTime( &start, &stop );
--- a/concurrency/managedOverhead.cu
+++ b/concurrency/managedOverhead.cu
@@ -73,7 +73,7 @@ usPerLaunch( int cIterations, size_t cPages=0 )
    chTimerGetTime( &start );
    for ( int i = 0; i < cIterations; i++ ) {
        NullKernel<<<1,1>>>();
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
        if ( bTouch && 0 != p ) {
            for ( int iPage = 0; iPage < cPages; iPage++ ) {
                ((volatile unsigned char *) p)[iPage*pageSize] |= 1;
--- a/concurrency/nullDtoHMemcpyAsync.cu
+++ b/concurrency/nullDtoHMemcpyAsync.cu
@@ -60,7 +60,7 @@ main( int argc, char *argv[] )
        cuda(MemcpyAsync( hostInt, deviceInt, sizeof(int), 
            cudaMemcpyDeviceToHost, NULL ) );
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/nullDtoHMemcpySync.cu
+++ b/concurrency/nullDtoHMemcpySync.cu
@@ -61,7 +61,7 @@ main( int argc, char *argv[] )
            cuda(Memcpy( hostInt, deviceInt, sizeof(int), 
                cudaMemcpyDeviceToHost ) );
        }
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
        chTimerGetTime( &stop );
        cIterations *= 2;
    } while ( chTimerElapsedTime( &start, &stop ) < 0.5f ) ;
--- a/concurrency/nullHtoDMemcpyAsync.cu
+++ b/concurrency/nullHtoDMemcpyAsync.cu
@@ -60,7 +60,7 @@ main( int argc, char *argv[] )
        cuda(MemcpyAsync( deviceInt, hostInt, sizeof(int), 
            cudaMemcpyHostToDevice, NULL ) );
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/nullKernelAsync.cu
+++ b/concurrency/nullKernelAsync.cu
@@ -60,7 +60,7 @@ usPerLaunch( int cIterations )
    for ( int i = 0; i < cIterations; i++ ) {
        NullKernel<<<1,1>>>();
    }
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    chTimerGetTime( &stop );
    microseconds = 1e6*chTimerElapsedTime( &start, &stop );
--- a/concurrency/nullKernelSync.cu
+++ b/concurrency/nullKernelSync.cu
@@ -59,7 +59,7 @@ usPerLaunch( int cIterations )
    chTimerGetTime( &start );
    for ( int i = 0; i < cIterations; i++ ) {
        NullKernel<<<1,1>>>();
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
    }
    chTimerGetTime( &stop );
--- a/concurrency/pageableMemcpyHtoD.cu
+++ b/concurrency/pageableMemcpyHtoD.cu
@@ -145,7 +145,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/pageableMemcpyHtoD16.cu
+++ b/concurrency/pageableMemcpyHtoD16.cu
@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/pageableMemcpyHtoD16Blocking.cu
+++ b/concurrency/pageableMemcpyHtoD16Blocking.cu
@@ -156,7 +156,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/pageableMemcpyHtoD16Broken.cu
+++ b/concurrency/pageableMemcpyHtoD16Broken.cu
@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/pageableMemcpyHtoD16Synchronous.cu
+++ b/concurrency/pageableMemcpyHtoD16Synchronous.cu
@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/concurrency/peer2peerMemcpy.cu
+++ b/concurrency/peer2peerMemcpy.cu
@@ -195,7 +195,7 @@ main( int argc, char *argv[] )
    for ( int i = 0; i < cIterations; i++ ) {
        chMemcpyPeerToPeer( deviceInt[0], 0, deviceInt[1], 1, numInts*sizeof(int) ) ;
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    {
--- a/corr/normalizedCrossCorrelation.cu
+++ b/corr/normalizedCrossCorrelation.cu
@@ -476,7 +476,8 @@ main(int argc, char *argv[])
    int sharedPitch;
    int sharedMem;
-    char *inputFilename = "coins.pgm";
+    char defaultInputFilename[] = "coins.pgm";
    char *inputFilename = defaultInputFilename;
    char *outputFilename = NULL;
    cudaArray *pArrayImage = NULL;
--- a/histogram/histogram.cu
+++ b/histogram/histogram.cu
@@ -261,7 +261,8 @@ main(int argc, char *argv[])
    dim3 threads;
-    char *inputFilename = "coins.pgm";
+    char defaultInputFilename[] = "coins.pgm";
    char *inputFilename = defaultInputFilename;
    char *outputFilename = NULL;
    cudaArray *pArrayImage = NULL;
--- a/memory/globalCopy.cu
+++ b/memory/globalCopy.cu
@@ -114,7 +114,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn,
        GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn+bOffsetSrc, N-bOffsetDst-bOffsetSrc );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/memory/globalCopy2.cu
+++ b/memory/globalCopy2.cu
@@ -160,7 +160,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn0, T *deviceIn1,
        GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn0+bOffsetSrc, deviceIn1+bOffsetSrc, N-bOffsetDst-bOffsetSrc );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/memory/globalRead.cu
+++ b/memory/globalRead.cu
@@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/memory/globalReadTex.cu
+++ b/memory/globalReadTex.cu
@@ -236,7 +236,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, bOffset, N-bOffset, false );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/memory/globalWrite.cu
+++ b/memory/globalWrite.cu
@@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads )
        GlobalWrites<T,n><<<cBlocks,cThreads>>>( out+bOffset, (T) 0xcc, N-bOffset );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/memory/spinlockReduction.cu
+++ b/memory/spinlockReduction.cu
@@ -208,7 +208,7 @@ AtomicsPerSecond( size_t N, int cBlocks, int cThreads )
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
--- a/microbench/globalRead.cu
+++ b/microbench/globalRead.cu
@@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/microbench/globalWrite.cu
+++ b/microbench/globalWrite.cu
@@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads )
        GlobalWrites<T,n><<<cBlocks,cThreads>>>( out+bOffset, (T) 0xcc, N-bOffset );
    }
    cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
--- a/microbench/reportClocks.cu
+++ b/microbench/reportClocks.cu
@@ -97,13 +97,13 @@ ReportTimesAndIDs( FILE *clocksFile, FILE *tidsFile, dim3 gridSize, dim3 blockSi
    cuda(EventCreate( &stop ) );
    WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    cuda(EventRecord( start, 0 ) );
    WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
    cuda(EventRecord( stop, 0 ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    {
        float ms;
--- a/nbody/nbody.cu
+++ b/nbody/nbody.cu
@@ -7,8 +7,8 @@
 * parallelizable, with lots of FLOPS per unit of external 
 * memory bandwidth required.
 *
- * Build with: nvcc -I ../chLib <options> nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu
+ * Build with: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu
- *   On Linux: nvcc -I ../chLib <options> nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lpthread -lrt
+ *   On Linux: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lcudart_static -ldl -lrt
 * Requires: No minimum SM requirement.  If SM 3.x is not available,
 * this application quietly replaces the shuffle and fast-atomic
 * implementations with the shared memory implementation.
@@ -482,7 +482,7 @@ ComputeGravitation(
            sumY += g_hostAOS_Force[i*3+1];
            sumZ += g_hostAOS_Force[i*3+2];
        }
-        *maxRelError = max( fabs(sumX), max(fabs(sumY), fabs(sumZ)) );
+        *maxRelError = std::max( fabs(sumX), std::max(fabs(sumY), fabs(sumZ)) );
        if ( g_ZeroThreshold != 0.0 && 
             fabs( *maxRelError ) > g_ZeroThreshold ) {
            printf( "Maximum sum of forces > threshold (%E > %E)\n",
--- a/nbody/nbody_GPU_AOS_tiled.cuh
+++ b/nbody/nbody_GPU_AOS_tiled.cuh
@@ -78,11 +78,11 @@ inline float
 __device__
 warpReduce( float x )
 {
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  8 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  8 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  4 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  4 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  2 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  2 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  1 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  1 ) );
    return x;
 }
@@ -111,10 +111,10 @@ DoNondiagonalTile_GPU(
        float fx, fy, fz;
        float4 bodyPosMass;
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
        bodyBodyInteraction<float>(
            &fx, &fy, &fz,
--- a/nbody/nbody_GPU_AOS_tiled_const.cuh
+++ b/nbody/nbody_GPU_AOS_tiled_const.cuh
@@ -82,11 +82,11 @@ __device__
 warpReduce_const( float x )
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ > 300
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  8 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  8 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  4 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  4 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  2 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  2 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  1 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  1 ) );
 #endif
    return x;
 }
@@ -117,10 +117,10 @@ DoNondiagonalTile_GPU_const(
        float fx, fy, fz;
        float4 bodyPosMass;
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
        bodyBodyInteraction<float>(
            &fx, &fy, &fz,
@@ -200,10 +200,10 @@ DoNondiagonalTile_GPU_const(
        float fx, fy, fz;
        float4 bodyPosMass;
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
        bodyBodyInteraction<float>(
            &fx, &fy, &fz,
--- a/nbody/nbody_GPU_SOA_tiled.cuh
+++ b/nbody/nbody_GPU_SOA_tiled.cuh
@@ -98,10 +98,10 @@ DoNondiagonalTile_GPU_SOA(
        float fx, fy, fz;
        float4 bodyPosMass;
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
        bodyBodyInteraction<float>(
            &fx, &fy, &fz,
--- a/nbody/nbody_GPU_Shuffle.cuh
+++ b/nbody/nbody_GPU_Shuffle.cuh
@@ -56,10 +56,10 @@ ComputeNBodyGravitation_Shuffle(
                float fx, fy, fz;
                float4 shufDstPosMass;
-                shufDstPosMass.x = __shfl( shufSrcPosMass.x, k );
+                shufDstPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, k );
-                shufDstPosMass.y = __shfl( shufSrcPosMass.y, k );
+                shufDstPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, k );
-                shufDstPosMass.z = __shfl( shufSrcPosMass.z, k );
+                shufDstPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, k );
-                shufDstPosMass.w = __shfl( shufSrcPosMass.w, k );
+                shufDstPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, k );
                bodyBodyInteraction(
                    &fx, &fy, &fz, 
--- a/reduction/reduction.cu
+++ b/reduction/reduction.cu
@@ -85,7 +85,7 @@ TimedReduction(
    cuda(Malloc( &partialSums, cBlocks*sizeof(int) ) );
    cuda(EventCreate( &start ) );
    cuda(EventCreate( &stop ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    cuda(EventRecord( start, 0 ) );
    hostReduction( 
@@ -165,7 +165,7 @@ usPerInvocation( int cIterations, size_t N,
    for ( int i = 0; i < cIterations; i++ ) {
        pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    ret = chTimerElapsedTime( &start, &stop );
    ret = (ret / (double) cIterations) * 1e6;
--- a/reduction/reductionTemplated.cu
+++ b/reduction/reductionTemplated.cu
@@ -129,7 +129,7 @@ TimedReduction(
    cuda(Malloc( &partialSums, cBlocks*sizeof(ReductionType) ) );
    cuda(EventCreate( &start ) );
    cuda(EventCreate( &stop ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    cuda(EventRecord( start, 0 ) );
    hostReduction( deviceAnswer, partialSums, deviceIn, N, cBlocks, cThreads );
@@ -209,7 +209,7 @@ usPerInvocation( int cIterations, size_t N,
    for ( int i = 0; i < cIterations; i++ ) {
        pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
    }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    ret = chTimerElapsedTime( &start, &stop );
    ret = (ret / (double) cIterations) * 1e6;
--- a/reduction/reductionWarpShuffle.cuh
+++ b/reduction/reductionWarpShuffle.cuh
@@ -61,11 +61,11 @@ Reduction2_kernel( int *out, const int *in, size_t N )
        volatile int *wsSum = sPartials;
        if ( blockDim.x > 32 ) wsSum[tid] += wsSum[tid + 32];
        int mySum = wsSum[tid];
-        mySum += __shfl_xor( mySum, 16 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum, 16 );
-        mySum += __shfl_xor( mySum,  8 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  8 );
-        mySum += __shfl_xor( mySum,  4 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  4 );
-        mySum += __shfl_xor( mySum,  2 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  2 );
-        mySum += __shfl_xor( mySum,  1 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  1 );
 /*        wsSum[tid] += wsSum[tid + 16];
        wsSum[tid] += wsSum[tid + 8];
        wsSum[tid] += wsSum[tid + 4];
--- a/scan/int/timeScan.cu
+++ b/scan/int/timeScan.cu
@@ -96,7 +96,7 @@ TimeScan( void (*pfnScanGPU)(T *, const T *, size_t, int),
    for ( int i = 0; i < cIterations; i++ ) {
        pfnScanGPU( outGPU, inGPU, N, numThreads );
    }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
        goto Error;
    chTimerGetTime( &stop );
--- a/scan/streamCompact/timeStreamCompact_odd.cu
+++ b/scan/streamCompact/timeStreamCompact_odd.cu
@@ -122,7 +122,7 @@ TimeStreamCompact(
    for ( int i = 0; i < cIterations; i++ ) {
        pfnScanGPU( outGPU, deviceTotal, inGPU, N, numThreads );
    }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
        goto Error;
    chTimerGetTime( &stop );
--- a/scan/warp/scanWarpShuffle.cuh
+++ b/scan/warp/scanWarpShuffle.cuh
@@ -69,7 +69,7 @@ exclusive_scan_warp_shfl(int mysum)
 {
    const unsigned int lane   = threadIdx.x & 31;
    for(int i = 0; i < levels; ++i)
-        mysum = shfl_scan_add_step(mysum, 1 << i);
+        mysum = scanWarpShuffle_step( mysum, 1 << i);
    mysum = __shfl_up(mysum, 1);
    return (lane) ? mysum : 0;
 }
--- a/scan/warp/testScanWarp.cu
+++ b/scan/warp/testScanWarp.cu
@@ -63,7 +63,7 @@ enum ScanType {
 #include "scanReduceThenScan.cuh"
 #include "scanReduceThenScan_0.cuh"
 #include "scan2Level.cuh"
-#include "ScanThrust.cuh"
+#include "scanThrust.cuh"
 template<int period>
 void
--- a/texturing/surf2Dmemset.cu
+++ b/texturing/surf2Dmemset.cu
@@ -184,7 +184,7 @@ CreateAndPrintTex(
    blocks.y = 1;
    threads.x = 64; threads.y = 4;
    TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int row = 0; row < outHeight; row++ ) {
        float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
--- a/texturing/tex1d_9bit.cu
+++ b/texturing/tex1d_9bit.cu
@@ -98,7 +98,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
    tex.addressMode[0] = addressMode;
    cuda(HostGetDevicePointer(&outDevice, outHost, 0));
    TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    printf( "X\tY\tActual Value\tExpected Value\tDiff\n" );
    for ( int i = 0; i < outN; i++ ) {
--- a/texturing/tex1d_addressing.cu
+++ b/texturing/tex1d_addressing.cu
@@ -169,7 +169,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
    tex.filterMode = filterMode;
    tex.addressMode[0] = addressMode;
    TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int i = 0; i < outN; i++ ) {
        float x = base+(float)i*increment;
--- a/texturing/tex1d_unnormalized.cu
+++ b/texturing/tex1d_unnormalized.cu
@@ -112,7 +112,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
    tex.addressMode[0] = addressMode;
    cuda(HostGetDevicePointer(&outDevice, outHost, 0));
    TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int i = 0; i < outN; i++ ) {
        printf( "(%.2f, %.2f)\n", outHost[i].x, outHost[i].y );
--- a/texturing/tex1dfetch.cu
+++ b/texturing/tex1dfetch.cu
@@ -65,7 +65,7 @@ PrintTex( float *host, size_t N )
    cuda(HostGetDevicePointer( (void **) &device, host, 0 ));
    TexReadout<<<2,384>>>( device, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int i = 0; i < N; i++ ) {
        printf( "%.2f ", host[i] );
    }
--- a/texturing/tex1dfetch_big.cu
+++ b/texturing/tex1dfetch_big.cu
@@ -177,7 +177,7 @@ TexChecksum( int *out, int c, size_t N )
        default:
            goto Error;
    }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
        goto Error;
    *out = checksumGPU();
    ret = true;
--- a/texturing/tex1dfetch_host.cu
+++ b/texturing/tex1dfetch_host.cu
@@ -65,7 +65,7 @@ PrintTex( float *host, size_t N )
    cuda(HostGetDevicePointer( (void **) &device, host, 0 ));
    TexReadout<<<2,384>>>( device, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int i = 0; i < N; i++ ) {
        printf( "%.2f ", host[i] );
    }
--- a/texturing/tex1dfetch_htod.cu
+++ b/texturing/tex1dfetch_htod.cu
@@ -68,7 +68,7 @@ MeasureBandwidth( void *out, size_t N, int blocks, int threads )
    chTimerGetTime( &start );
    TexReadout<<<2,384>>>( (float *) out, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    chTimerGetTime( &stop );
--- a/texturing/tex1dfetch_int2float.cu
+++ b/texturing/tex1dfetch_int2float.cu
@@ -119,7 +119,7 @@ CheckTexPromoteToFloat( size_t N )
                      cudaCreateChannelDesc<T>(), 
                      N*sizeof(T)));
    TexReadout<<<2,384>>>( foutDevice, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int i = 0; i < N; i++ ) {
        printf( "%.2f ", foutHost[i] );
--- a/texturing/tex1dfetch_offset.cu
+++ b/texturing/tex1dfetch_offset.cu
@@ -66,7 +66,7 @@ CheckTex( float *hostOut, const float *in, size_t offset, size_t N )
    cuda(HostGetDevicePointer( (void **) &deviceOut, hostOut, 0 ));
    TexReadout<<<2,384>>>( deviceOut, offset>>2, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int i = 0; i < N; i++ ) {
        if ( in[i] != hostOut[i] ) {
            printf( "Mismatch at index %d\n", i );
--- a/texturing/tex2d_addressing.cu
+++ b/texturing/tex2d_addressing.cu
@@ -139,7 +139,7 @@ CreateAndPrintTex(
    blocks.y = 1;
    threads.x = 64; threads.y = 4;
    TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int row = 0; row < outHeight; row++ ) {
        float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
--- a/texturing/tex2d_addressing_device.cu
+++ b/texturing/tex2d_addressing_device.cu
@@ -141,7 +141,7 @@ CreateAndPrintTex(
    blocks.y = 1;
    threads.x = 64; threads.y = 4;
    TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int row = 0; row < outHeight; row++ ) {
        float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
--- a/texturing/tex2d_memset.cu
+++ b/texturing/tex2d_memset.cu
@@ -157,7 +157,7 @@ CreateAndPrintTex(
    blocks.y = 1;
    threads.x = 64; threads.y = 4;
    TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
    for ( int row = 0; row < outHeight; row++ ) {
        float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
--- a/texturing/tex2d_opengl.cu
+++ b/texturing/tex2d_opengl.cu
@@ -189,7 +189,7 @@ void displayCB(void)		/* function called whenever redisplay needed */
    else {
        RenderTextureUnnormalized<<<g_height, 384>>>( g_deviceFrameBuffer, g_width, g_height );
    }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
        return;
    glRasterPos2f( 0.0f, 0.0f );
    glDrawPixels( g_width, g_height, GL_RGBA, GL_UNSIGNED_BYTE, g_hostFrameBuffer );