diff --git a/SMs/divergence.cu b/SMs/divergence.cu
index f71a633..fcda9b7 100644
--- a/SMs/divergence.cu
+++ b/SMs/divergence.cu
@@ -4,7 +4,7 @@
  *
  * Microdemo to measure performance implications of conditional code.
  *
- * Build with: nvcc [--gpu-architecture sm_xx] [-D USE_FLOAT] [-D USE_IF_STATEMENT] divergence.cu
+ * Build with: nvcc [--gpu-architecture sm_xx] divergence.cu
  * Requires: No minimum SM requirement.
  *
  * Copyright (c) 2021, Archaea Software, LLC.
@@ -46,66 +46,79 @@
 // apply blockDim and/or gridDim to n before passing in.
 //
 template<int base>
-void __device__ sumFloats( float *p, size_t N, size_t n )
+void __device__ sumInts( uint32_t *p, size_t N, size_t n )
 {
-    float f = base;
     for ( size_t i = 0; i < N; i++ ) {
-        *p += f;
+        *p += base;
         p += n;
     }
 }
 
-typedef void(*psumFloats)(float *, size_t, size_t);
+typedef void(*psumInts)(uint32_t *, size_t, size_t);
 
-__device__ psumFloats rgSumFloats[] = {
-    sumFloats< 0>, sumFloats< 1>, sumFloats< 2>, sumFloats< 3>,
-    sumFloats< 4>, sumFloats< 5>, sumFloats< 6>, sumFloats< 7>,
-    sumFloats< 8>, sumFloats< 9>, sumFloats<10>, sumFloats<11>,
-    sumFloats<12>, sumFloats<13>, sumFloats<14>, sumFloats<15>,
-    sumFloats<16>, sumFloats<17>, sumFloats<18>, sumFloats<19>,
-    sumFloats<20>, sumFloats<21>, sumFloats<22>, sumFloats<23>,
-    sumFloats<24>, sumFloats<25>, sumFloats<26>, sumFloats<27>,
-    sumFloats<28>, sumFloats<29>, sumFloats<30>, sumFloats<31> };
+__device__ psumInts rgSumInts[] = {
+    sumInts< 0>, sumInts< 1>, sumInts< 2>, sumInts< 3>,
+    sumInts< 4>, sumInts< 5>, sumInts< 6>, sumInts< 7>,
+    sumInts< 8>, sumInts< 9>, sumInts<10>, sumInts<11>,
+    sumInts<12>, sumInts<13>, sumInts<14>, sumInts<15>,
+    sumInts<16>, sumInts<17>, sumInts<18>, sumInts<19>,
+    sumInts<20>, sumInts<21>, sumInts<22>, sumInts<23>,
+    sumInts<24>, sumInts<25>, sumInts<26>, sumInts<27>,
+    sumInts<28>, sumInts<29>, sumInts<30>, sumInts<31> };
 
+template<uint32_t sh>
 __global__ void
-sumFloats_bywarp( float *p, size_t N )
+sumInts_bythread( uint32_t *p, size_t N )
 {
-    uint32_t warpid = threadIdx.x>>5;
+    uint32_t warpish_id = threadIdx.x>>sh;
     N /= blockDim.x*gridDim.x;
-    rgSumFloats[warpid]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
+    rgSumInts[warpish_id&31]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
 }
 
-__global__ void
-sumFloats_bythread( float *p, size_t N )
-{
-    
-}
-
-int
-main()
+template<uint32_t sh>
+static double
+timeByThreads( uint32_t *p, size_t N )
 {
     cudaError_t status;
-    size_t N = 1024*1024*1024UL;
-    float *p = 0;
-    float et;
+    float elapsed_time;
+    double ret = 0.0;
     cudaEvent_t start = 0, stop = 0;
 
-    cuda(Malloc( (void **) &p, N*sizeof(float)) );
-    cuda(Memset( p, 0, N*sizeof(float)) );
     cuda(EventCreate( &start ));
     cuda(EventCreate( &stop ));
 
     cuda(EventRecord( start ));
-    sumFloats_bywarp<<<3072,256>>>( p, N );
+    sumInts_bythread<sh><<<3072,1024>>>( p, N );
     cuda(EventRecord( stop ));
     cuda(DeviceSynchronize());
-    cuda(EventElapsedTime( &et, start, stop ));
-    
-    printf( "%.2f ms = %.2f Gops/s\n", et, (double) N*1000.0/et/1e9 );
-
-    cudaFree( p );
+    cuda(EventElapsedTime( &elapsed_time, start, stop ));
+    ret = N*1000.0/elapsed_time/1e9;
+    printf( "%2d threads: %f Gops/s\n", 1<<sh, ret );
+Error:
     cudaEventDestroy( stop );
     cudaEventDestroy( start );
+    return ret;
+}
+
+int
+main()
+{
+    cudaError_t status;
+    size_t N = 1024*1024*1024UL;
+    uint32_t *p = 0;
+
+    cuda(Malloc( (void **) &p, N*sizeof(uint32_t)) );
+    cuda(Memset( p, 0, N*sizeof(uint32_t)) );
+
+    timeByThreads<6>( p, N );
+    timeByThreads<5>( p, N );
+    timeByThreads<4>( p, N );
+    timeByThreads<3>( p, N );
+    timeByThreads<2>( p, N );
+    timeByThreads<1>( p, N );
+    timeByThreads<0>( p, N );
+
+    cudaFree( p );
     return 0;
 Error:
     return 1;
diff --git a/SMs/testShuffle.cu b/SMs/testShuffle.cu
index 3b719a6..b1f329f 100644
--- a/SMs/testShuffle.cu
+++ b/SMs/testShuffle.cu
@@ -48,7 +48,7 @@ TestShuffle( int *out, const int *in, size_t N  )
     size_t i = blockIdx.x*blockDim.x+threadIdx.x;
 
     int value = (int) i;//in[i];
-    out[i] = __shfl_up( value, 1 );
+    out[i] = __shfl_up_sync( 0xffffffff, value, 1 );
 }
 
 cudaError_t
diff --git a/chLib/chError.h b/chLib/chError.h
index 53553ad..b332dad 100644
--- a/chLib/chError.h
+++ b/chLib/chError.h
@@ -112,6 +112,40 @@ chGetErrorString( CUresult status )
         ErrorValue(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
         ErrorValue(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED)
 #endif
+        ErrorValue(CUDA_ERROR_STUB_LIBRARY)
+        ErrorValue(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED)
+        ErrorValue(CUDA_ERROR_DEVICE_NOT_LICENSED)
+        ErrorValue(CUDA_ERROR_INVALID_PTX)
+        ErrorValue(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT)
+        ErrorValue(CUDA_ERROR_NVLINK_UNCORRECTABLE)
+        ErrorValue(CUDA_ERROR_JIT_COMPILER_NOT_FOUND)
+        ErrorValue(CUDA_ERROR_JIT_COMPILATION_DISABLED)
+        ErrorValue(CUDA_ERROR_UNSUPPORTED_PTX_VERSION)
+        ErrorValue(CUDA_ERROR_ILLEGAL_STATE)
+        ErrorValue(CUDA_ERROR_ILLEGAL_ADDRESS)
+        ErrorValue(CUDA_ERROR_HARDWARE_STACK_ERROR)
+        ErrorValue(CUDA_ERROR_ILLEGAL_INSTRUCTION)
+        ErrorValue(CUDA_ERROR_MISALIGNED_ADDRESS)
+        ErrorValue(CUDA_ERROR_INVALID_ADDRESS_SPACE)
+        ErrorValue(CUDA_ERROR_INVALID_PC)
+        ErrorValue(CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
+        ErrorValue(CUDA_ERROR_NOT_PERMITTED)
+        ErrorValue(CUDA_ERROR_NOT_SUPPORTED)
+        ErrorValue(CUDA_ERROR_SYSTEM_NOT_READY)
+        ErrorValue(CUDA_ERROR_SYSTEM_DRIVER_MISMATCH)
+        ErrorValue(CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_INVALIDATED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_MERGE)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNMATCHED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNJOINED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_ISOLATION)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_IMPLICIT)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD)
+        ErrorValue(CUDA_ERROR_TIMEOUT)
+        ErrorValue(CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE)
+        ErrorValue(CUDA_ERROR_CAPTURED_EVENT)
+
         ErrorValue(CUDA_ERROR_UNKNOWN)
     }
     return "chGetErrorString - unknown error value";
diff --git a/concurrency/breakevenDtoHMemcpy.cu b/concurrency/breakevenDtoHMemcpy.cu
index 5d2dc0b..404ed5d 100644
--- a/concurrency/breakevenDtoHMemcpy.cu
+++ b/concurrency/breakevenDtoHMemcpy.cu
@@ -70,7 +70,7 @@ main( int argc, char *argv[] )
             cuda(MemcpyAsync( hostInt, deviceInt, byteCount, 
                 cudaMemcpyDeviceToHost, NULL ) );
         }
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
         chTimerGetTime( &stop );
 
         {
diff --git a/concurrency/breakevenHtoDMemcpy.cu b/concurrency/breakevenHtoDMemcpy.cu
index 707683e..0ec1dfa 100644
--- a/concurrency/breakevenHtoDMemcpy.cu
+++ b/concurrency/breakevenHtoDMemcpy.cu
@@ -70,7 +70,7 @@ main( int argc, char *argv[] )
             cuda(MemcpyAsync( deviceInt, hostInt, byteCount, 
                 cudaMemcpyHostToDevice, NULL ) );
         }
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
         chTimerGetTime( &stop );
 
         {
diff --git a/concurrency/breakevenKernelAsync.cu b/concurrency/breakevenKernelAsync.cu
index 763b22b..6067453 100644
--- a/concurrency/breakevenKernelAsync.cu
+++ b/concurrency/breakevenKernelAsync.cu
@@ -67,7 +67,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         WaitKernel<<<1,1>>>( 0, false );
     }
-    cudaThreadSynchronize();
+    cudaDeviceSynchronize();
 
     printf("Cycles\tus\n" );
     for ( int cycles = 0; cycles < 2500; cycles += 100 ) {
@@ -76,7 +76,7 @@ main( int argc, char *argv[] )
         for ( int i = 0; i < cIterations; i++ ) {
             WaitKernel<<<1,1>>>( cycles, false );
         }
-        cudaThreadSynchronize();
+        cudaDeviceSynchronize();
         chTimerGetTime( &stop );
         double microseconds = 1e6*chTimerElapsedTime( &start, &stop );
         double usPerLaunch = microseconds / (float) cIterations;
diff --git a/concurrency/eventRecord.cu b/concurrency/eventRecord.cu
index 689d6b7..91600fc 100644
--- a/concurrency/eventRecord.cu
+++ b/concurrency/eventRecord.cu
@@ -72,7 +72,7 @@ usPerLaunch( int cIterations, int cEvents )
             cuda(EventRecord( events[j], NULL ) );
         }
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     microseconds = 1e6*chTimerElapsedTime( &start, &stop );
diff --git a/concurrency/managedOverhead.cu b/concurrency/managedOverhead.cu
index 7f0f341..517e0d9 100644
--- a/concurrency/managedOverhead.cu
+++ b/concurrency/managedOverhead.cu
@@ -73,7 +73,7 @@ usPerLaunch( int cIterations, size_t cPages=0 )
     chTimerGetTime( &start );
     for ( int i = 0; i < cIterations; i++ ) {
         NullKernel<<<1,1>>>();
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
         if ( bTouch && 0 != p ) {
             for ( int iPage = 0; iPage < cPages; iPage++ ) {
                 ((volatile unsigned char *) p)[iPage*pageSize] |= 1;
diff --git a/concurrency/nullDtoHMemcpyAsync.cu b/concurrency/nullDtoHMemcpyAsync.cu
index 92d8fc9..7153c22 100644
--- a/concurrency/nullDtoHMemcpyAsync.cu
+++ b/concurrency/nullDtoHMemcpyAsync.cu
@@ -60,7 +60,7 @@ main( int argc, char *argv[] )
         cuda(MemcpyAsync( hostInt, deviceInt, sizeof(int), 
             cudaMemcpyDeviceToHost, NULL ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/nullDtoHMemcpySync.cu b/concurrency/nullDtoHMemcpySync.cu
index e3c3812..6bbd360 100644
--- a/concurrency/nullDtoHMemcpySync.cu
+++ b/concurrency/nullDtoHMemcpySync.cu
@@ -61,7 +61,7 @@ main( int argc, char *argv[] )
             cuda(Memcpy( hostInt, deviceInt, sizeof(int), 
                 cudaMemcpyDeviceToHost ) );
         }
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
         chTimerGetTime( &stop );
         cIterations *= 2;
     } while ( chTimerElapsedTime( &start, &stop ) < 0.5f ) ;
diff --git a/concurrency/nullHtoDMemcpyAsync.cu b/concurrency/nullHtoDMemcpyAsync.cu
index d0a8cd2..c90e4bc 100644
--- a/concurrency/nullHtoDMemcpyAsync.cu
+++ b/concurrency/nullHtoDMemcpyAsync.cu
@@ -60,7 +60,7 @@ main( int argc, char *argv[] )
         cuda(MemcpyAsync( deviceInt, hostInt, sizeof(int), 
             cudaMemcpyHostToDevice, NULL ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/nullKernelAsync.cu b/concurrency/nullKernelAsync.cu
index 0003fb6..0d3ca09 100644
--- a/concurrency/nullKernelAsync.cu
+++ b/concurrency/nullKernelAsync.cu
@@ -60,7 +60,7 @@ usPerLaunch( int cIterations )
     for ( int i = 0; i < cIterations; i++ ) {
         NullKernel<<<1,1>>>();
     }
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     chTimerGetTime( &stop );
 
     microseconds = 1e6*chTimerElapsedTime( &start, &stop );
diff --git a/concurrency/nullKernelSync.cu b/concurrency/nullKernelSync.cu
index 0b0b4dc..6b1c3c8 100644
--- a/concurrency/nullKernelSync.cu
+++ b/concurrency/nullKernelSync.cu
@@ -59,7 +59,7 @@ usPerLaunch( int cIterations )
     chTimerGetTime( &start );
     for ( int i = 0; i < cIterations; i++ ) {
         NullKernel<<<1,1>>>();
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
     }
     chTimerGetTime( &stop );
 
diff --git a/concurrency/pageableMemcpyHtoD.cu b/concurrency/pageableMemcpyHtoD.cu
index dc94cca..a2f03da 100644
--- a/concurrency/pageableMemcpyHtoD.cu
+++ b/concurrency/pageableMemcpyHtoD.cu
@@ -145,7 +145,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/pageableMemcpyHtoD16.cu b/concurrency/pageableMemcpyHtoD16.cu
index a5059eb..b0c1f51 100644
--- a/concurrency/pageableMemcpyHtoD16.cu
+++ b/concurrency/pageableMemcpyHtoD16.cu
@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/pageableMemcpyHtoD16Blocking.cu b/concurrency/pageableMemcpyHtoD16Blocking.cu
index d24dd29..f3973f4 100644
--- a/concurrency/pageableMemcpyHtoD16Blocking.cu
+++ b/concurrency/pageableMemcpyHtoD16Blocking.cu
@@ -156,7 +156,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/pageableMemcpyHtoD16Broken.cu b/concurrency/pageableMemcpyHtoD16Broken.cu
index 9cd53b3..643d7a8 100644
--- a/concurrency/pageableMemcpyHtoD16Broken.cu
+++ b/concurrency/pageableMemcpyHtoD16Broken.cu
@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/pageableMemcpyHtoD16Synchronous.cu b/concurrency/pageableMemcpyHtoD16Synchronous.cu
index fe09c5e..49a2441 100644
--- a/concurrency/pageableMemcpyHtoD16Synchronous.cu
+++ b/concurrency/pageableMemcpyHtoD16Synchronous.cu
@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/concurrency/peer2peerMemcpy.cu b/concurrency/peer2peerMemcpy.cu
index 5d33d08..826f3b9 100644
--- a/concurrency/peer2peerMemcpy.cu
+++ b/concurrency/peer2peerMemcpy.cu
@@ -195,7 +195,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyPeerToPeer( deviceInt[0], 0, deviceInt[1], 1, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
 
     {
diff --git a/corr/normalizedCrossCorrelation.cu b/corr/normalizedCrossCorrelation.cu
index eea6511..1413bf7 100644
--- a/corr/normalizedCrossCorrelation.cu
+++ b/corr/normalizedCrossCorrelation.cu
@@ -476,7 +476,8 @@ main(int argc, char *argv[])
 
     int sharedPitch;
     int sharedMem;
-    char *inputFilename = "coins.pgm";
+    char defaultInputFilename[] = "coins.pgm";
+    char *inputFilename = defaultInputFilename;
     char *outputFilename = NULL;
 
     cudaArray *pArrayImage = NULL;
diff --git a/histogram/histogram.cu b/histogram/histogram.cu
index 6234189..d0a3167 100644
--- a/histogram/histogram.cu
+++ b/histogram/histogram.cu
@@ -261,7 +261,8 @@ main(int argc, char *argv[])
 
     dim3 threads;
 
-    char *inputFilename = "coins.pgm";
+    char defaultInputFilename[] = "coins.pgm";
+    char *inputFilename = defaultInputFilename;
     char *outputFilename = NULL;
 
     cudaArray *pArrayImage = NULL;
diff --git a/memory/globalCopy.cu b/memory/globalCopy.cu
index 06d63b3..7131380 100644
--- a/memory/globalCopy.cu
+++ b/memory/globalCopy.cu
@@ -114,7 +114,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn,
         GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn+bOffsetSrc, N-bOffsetDst-bOffsetSrc );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/memory/globalCopy2.cu b/memory/globalCopy2.cu
index 9cf1199..7908949 100644
--- a/memory/globalCopy2.cu
+++ b/memory/globalCopy2.cu
@@ -160,7 +160,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn0, T *deviceIn1,
         GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn0+bOffsetSrc, deviceIn1+bOffsetSrc, N-bOffsetDst-bOffsetSrc );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/memory/globalRead.cu b/memory/globalRead.cu
index f191953..b2aad68 100644
--- a/memory/globalRead.cu
+++ b/memory/globalRead.cu
@@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
         GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/memory/globalReadTex.cu b/memory/globalReadTex.cu
index 3829156..88cf64c 100644
--- a/memory/globalReadTex.cu
+++ b/memory/globalReadTex.cu
@@ -236,7 +236,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
         GlobalReads<T,n><<<cBlocks,cThreads>>>( out, bOffset, N-bOffset, false );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/memory/globalWrite.cu b/memory/globalWrite.cu
index be88461..f71cbc4 100644
--- a/memory/globalWrite.cu
+++ b/memory/globalWrite.cu
@@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads )
         GlobalWrites<T,n><<<cBlocks,cThreads>>>( out+bOffset, (T) 0xcc, N-bOffset );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/memory/spinlockReduction.cu b/memory/spinlockReduction.cu
index fd0a3fd..e08395b 100644
--- a/memory/spinlockReduction.cu
+++ b/memory/spinlockReduction.cu
@@ -208,7 +208,7 @@ AtomicsPerSecond( size_t N, int cBlocks, int cThreads )
     }
 
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
 
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
diff --git a/microbench/globalRead.cu b/microbench/globalRead.cu
index 7041b92..2049814 100644
--- a/microbench/globalRead.cu
+++ b/microbench/globalRead.cu
@@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
         GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/microbench/globalWrite.cu b/microbench/globalWrite.cu
index d63cc18..ca19c31 100644
--- a/microbench/globalWrite.cu
+++ b/microbench/globalWrite.cu
@@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads )
         GlobalWrites<T,n><<<cBlocks,cThreads>>>( out+bOffset, (T) 0xcc, N-bOffset );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() ); 
     cuda(EventElapsedTime( &ms, evStart, evStop ) );
diff --git a/microbench/reportClocks.cu b/microbench/reportClocks.cu
index e9598a2..20ab911 100644
--- a/microbench/reportClocks.cu
+++ b/microbench/reportClocks.cu
@@ -97,13 +97,13 @@ ReportTimesAndIDs( FILE *clocksFile, FILE *tidsFile, dim3 gridSize, dim3 blockSi
     cuda(EventCreate( &stop ) );
 
     WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
 
     cuda(EventRecord( start, 0 ) );
     WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
     cuda(EventRecord( stop, 0 ) );
 
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
 
     {
         float ms;
diff --git a/nbody/nbody.cu b/nbody/nbody.cu
index 7c1a9e0..56bc70b 100644
--- a/nbody/nbody.cu
+++ b/nbody/nbody.cu
@@ -7,8 +7,8 @@
  * parallelizable, with lots of FLOPS per unit of external 
  * memory bandwidth required.
  *
- * Build with: nvcc -I ../chLib <options> nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu
- *   On Linux: nvcc -I ../chLib <options> nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lpthread -lrt
+ * Build with: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu
+ *   On Linux: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lcudart_static -ldl -lrt
  * Requires: No minimum SM requirement.  If SM 3.x is not available,
  * this application quietly replaces the shuffle and fast-atomic
  * implementations with the shared memory implementation.
@@ -482,7 +482,7 @@ ComputeGravitation(
             sumY += g_hostAOS_Force[i*3+1];
             sumZ += g_hostAOS_Force[i*3+2];
         }
-        *maxRelError = max( fabs(sumX), max(fabs(sumY), fabs(sumZ)) );
+        *maxRelError = std::max( fabs(sumX), std::max(fabs(sumY), fabs(sumZ)) );
         if ( g_ZeroThreshold != 0.0 && 
              fabs( *maxRelError ) > g_ZeroThreshold ) {
             printf( "Maximum sum of forces > threshold (%E > %E)\n",
diff --git a/nbody/nbody_GPU_AOS_tiled.cuh b/nbody/nbody_GPU_AOS_tiled.cuh
index 66d7c75..78b2ec3 100644
--- a/nbody/nbody_GPU_AOS_tiled.cuh
+++ b/nbody/nbody_GPU_AOS_tiled.cuh
@@ -78,11 +78,11 @@ inline float
 __device__
 warpReduce( float x )
 {
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  8 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  4 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  2 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  1 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  8 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  4 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  2 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  1 ) );
     return x;
 }
 
@@ -111,10 +111,10 @@ DoNondiagonalTile_GPU(
         float fx, fy, fz;
         float4 bodyPosMass;
 
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
 
         bodyBodyInteraction<float>(
             &fx, &fy, &fz,
diff --git a/nbody/nbody_GPU_AOS_tiled_const.cuh b/nbody/nbody_GPU_AOS_tiled_const.cuh
index 9295d94..ef31c6e 100644
--- a/nbody/nbody_GPU_AOS_tiled_const.cuh
+++ b/nbody/nbody_GPU_AOS_tiled_const.cuh
@@ -82,11 +82,11 @@ __device__
 warpReduce_const( float x )
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ > 300
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  8 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  4 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  2 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x),  1 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  8 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  4 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  2 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x),  1 ) );
 #endif
     return x;
 }
@@ -117,10 +117,10 @@ DoNondiagonalTile_GPU_const(
         float fx, fy, fz;
         float4 bodyPosMass;
 
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
 
         bodyBodyInteraction<float>(
             &fx, &fy, &fz,
@@ -200,10 +200,10 @@ DoNondiagonalTile_GPU_const(
         float fx, fy, fz;
         float4 bodyPosMass;
 
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
 
         bodyBodyInteraction<float>(
             &fx, &fy, &fz,
diff --git a/nbody/nbody_GPU_SOA_tiled.cuh b/nbody/nbody_GPU_SOA_tiled.cuh
index 1287a9d..959b3c2 100644
--- a/nbody/nbody_GPU_SOA_tiled.cuh
+++ b/nbody/nbody_GPU_SOA_tiled.cuh
@@ -98,10 +98,10 @@ DoNondiagonalTile_GPU_SOA(
         float fx, fy, fz;
         float4 bodyPosMass;
 
-        bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-        bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-        bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-        bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+        bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+        bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+        bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+        bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
 
         bodyBodyInteraction<float>(
             &fx, &fy, &fz,
diff --git a/nbody/nbody_GPU_Shuffle.cuh b/nbody/nbody_GPU_Shuffle.cuh
index 787b423..4e87f8c 100644
--- a/nbody/nbody_GPU_Shuffle.cuh
+++ b/nbody/nbody_GPU_Shuffle.cuh
@@ -56,10 +56,10 @@ ComputeNBodyGravitation_Shuffle(
                 float fx, fy, fz;
                 float4 shufDstPosMass;
 
-                shufDstPosMass.x = __shfl( shufSrcPosMass.x, k );
-                shufDstPosMass.y = __shfl( shufSrcPosMass.y, k );
-                shufDstPosMass.z = __shfl( shufSrcPosMass.z, k );
-                shufDstPosMass.w = __shfl( shufSrcPosMass.w, k );
+                shufDstPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, k );
+                shufDstPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, k );
+                shufDstPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, k );
+                shufDstPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, k );
 
                 bodyBodyInteraction(
                     &fx, &fy, &fz, 
diff --git a/reduction/reduction.cu b/reduction/reduction.cu
index c8d9802..aa7605b 100644
--- a/reduction/reduction.cu
+++ b/reduction/reduction.cu
@@ -85,7 +85,7 @@ TimedReduction(
     cuda(Malloc( &partialSums, cBlocks*sizeof(int) ) );
     cuda(EventCreate( &start ) );
     cuda(EventCreate( &stop ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
 
     cuda(EventRecord( start, 0 ) );
     hostReduction( 
@@ -165,7 +165,7 @@ usPerInvocation( int cIterations, size_t N,
     for ( int i = 0; i < cIterations; i++ ) {
         pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     ret = chTimerElapsedTime( &start, &stop );
     ret = (ret / (double) cIterations) * 1e6;
diff --git a/reduction/reductionTemplated.cu b/reduction/reductionTemplated.cu
index 8be6a23..285bf98 100644
--- a/reduction/reductionTemplated.cu
+++ b/reduction/reductionTemplated.cu
@@ -129,7 +129,7 @@ TimedReduction(
     cuda(Malloc( &partialSums, cBlocks*sizeof(ReductionType) ) );
     cuda(EventCreate( &start ) );
     cuda(EventCreate( &stop ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
 
     cuda(EventRecord( start, 0 ) );
     hostReduction( deviceAnswer, partialSums, deviceIn, N, cBlocks, cThreads );
@@ -209,7 +209,7 @@ usPerInvocation( int cIterations, size_t N,
     for ( int i = 0; i < cIterations; i++ ) {
         pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     ret = chTimerElapsedTime( &start, &stop );
     ret = (ret / (double) cIterations) * 1e6;
diff --git a/reduction/reductionWarpShuffle.cuh b/reduction/reductionWarpShuffle.cuh
index 969e595..72cd018 100644
--- a/reduction/reductionWarpShuffle.cuh
+++ b/reduction/reductionWarpShuffle.cuh
@@ -61,11 +61,11 @@ Reduction2_kernel( int *out, const int *in, size_t N )
         volatile int *wsSum = sPartials;
         if ( blockDim.x > 32 ) wsSum[tid] += wsSum[tid + 32];
         int mySum = wsSum[tid];
-        mySum += __shfl_xor( mySum, 16 );
-        mySum += __shfl_xor( mySum,  8 );
-        mySum += __shfl_xor( mySum,  4 );
-        mySum += __shfl_xor( mySum,  2 );
-        mySum += __shfl_xor( mySum,  1 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum, 16 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  8 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  4 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  2 );
+        mySum += __shfl_xor_sync( 0xffffffff, mySum,  1 );
 /*        wsSum[tid] += wsSum[tid + 16];
         wsSum[tid] += wsSum[tid + 8];
         wsSum[tid] += wsSum[tid + 4];
diff --git a/scan/int/timeScan.cu b/scan/int/timeScan.cu
index 81cf9fa..a82d06c 100644
--- a/scan/int/timeScan.cu
+++ b/scan/int/timeScan.cu
@@ -96,7 +96,7 @@ TimeScan( void (*pfnScanGPU)(T *, const T *, size_t, int),
     for ( int i = 0; i < cIterations; i++ ) {
         pfnScanGPU( outGPU, inGPU, N, numThreads );
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         goto Error;
     chTimerGetTime( &stop );
 
diff --git a/scan/streamCompact/timeStreamCompact_odd.cu b/scan/streamCompact/timeStreamCompact_odd.cu
index 8d8a1ea..dafd0b1 100644
--- a/scan/streamCompact/timeStreamCompact_odd.cu
+++ b/scan/streamCompact/timeStreamCompact_odd.cu
@@ -122,7 +122,7 @@ TimeStreamCompact(
     for ( int i = 0; i < cIterations; i++ ) {
         pfnScanGPU( outGPU, deviceTotal, inGPU, N, numThreads );
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         goto Error;
     chTimerGetTime( &stop );
 
diff --git a/scan/warp/scanWarpShuffle.cuh b/scan/warp/scanWarpShuffle.cuh
index 4573ac4..5521465 100644
--- a/scan/warp/scanWarpShuffle.cuh
+++ b/scan/warp/scanWarpShuffle.cuh
@@ -69,7 +69,7 @@ exclusive_scan_warp_shfl(int mysum)
 {
     const unsigned int lane   = threadIdx.x & 31;
     for(int i = 0; i < levels; ++i)
-        mysum = shfl_scan_add_step(mysum, 1 << i);
+        mysum = scanWarpShuffle_step( mysum, 1 << i);
     mysum = __shfl_up(mysum, 1);
     return (lane) ? mysum : 0;
 }
diff --git a/scan/warp/testScanWarp.cu b/scan/warp/testScanWarp.cu
index 34865b7..11f5706 100644
--- a/scan/warp/testScanWarp.cu
+++ b/scan/warp/testScanWarp.cu
@@ -63,7 +63,7 @@ enum ScanType {
 #include "scanReduceThenScan.cuh"
 #include "scanReduceThenScan_0.cuh"
 #include "scan2Level.cuh"
-#include "ScanThrust.cuh"
+#include "scanThrust.cuh"
 
 template<int period>
 void
diff --git a/texturing/surf2Dmemset.cu b/texturing/surf2Dmemset.cu
index eb62aa1..6f0fa60 100644
--- a/texturing/surf2Dmemset.cu
+++ b/texturing/surf2Dmemset.cu
@@ -184,7 +184,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
diff --git a/texturing/tex1d_9bit.cu b/texturing/tex1d_9bit.cu
index a4bb5f2..df339d4 100644
--- a/texturing/tex1d_9bit.cu
+++ b/texturing/tex1d_9bit.cu
@@ -98,7 +98,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
     tex.addressMode[0] = addressMode;
     cuda(HostGetDevicePointer(&outDevice, outHost, 0));
     TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     printf( "X\tY\tActual Value\tExpected Value\tDiff\n" );
     for ( int i = 0; i < outN; i++ ) {
diff --git a/texturing/tex1d_addressing.cu b/texturing/tex1d_addressing.cu
index 49d28f1..d3e2e0c 100644
--- a/texturing/tex1d_addressing.cu
+++ b/texturing/tex1d_addressing.cu
@@ -169,7 +169,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
     tex.filterMode = filterMode;
     tex.addressMode[0] = addressMode;
     TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int i = 0; i < outN; i++ ) {
         float x = base+(float)i*increment;
diff --git a/texturing/tex1d_unnormalized.cu b/texturing/tex1d_unnormalized.cu
index 98fbb07..61ccaa7 100644
--- a/texturing/tex1d_unnormalized.cu
+++ b/texturing/tex1d_unnormalized.cu
@@ -112,7 +112,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
     tex.addressMode[0] = addressMode;
     cuda(HostGetDevicePointer(&outDevice, outHost, 0));
     TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int i = 0; i < outN; i++ ) {
         printf( "(%.2f, %.2f)\n", outHost[i].x, outHost[i].y );
diff --git a/texturing/tex1dfetch.cu b/texturing/tex1dfetch.cu
index 25cd590..9e2627e 100644
--- a/texturing/tex1dfetch.cu
+++ b/texturing/tex1dfetch.cu
@@ -65,7 +65,7 @@ PrintTex( float *host, size_t N )
     cuda(HostGetDevicePointer( (void **) &device, host, 0 ));
     
     TexReadout<<<2,384>>>( device, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         printf( "%.2f ", host[i] );
     }
diff --git a/texturing/tex1dfetch_big.cu b/texturing/tex1dfetch_big.cu
index a9990c8..1b4364c 100644
--- a/texturing/tex1dfetch_big.cu
+++ b/texturing/tex1dfetch_big.cu
@@ -177,7 +177,7 @@ TexChecksum( int *out, int c, size_t N )
         default:
             goto Error;
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         goto Error;
     *out = checksumGPU();
     ret = true;
diff --git a/texturing/tex1dfetch_host.cu b/texturing/tex1dfetch_host.cu
index 488fe61..7279f05 100644
--- a/texturing/tex1dfetch_host.cu
+++ b/texturing/tex1dfetch_host.cu
@@ -65,7 +65,7 @@ PrintTex( float *host, size_t N )
     cuda(HostGetDevicePointer( (void **) &device, host, 0 ));
     
     TexReadout<<<2,384>>>( device, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         printf( "%.2f ", host[i] );
     }
diff --git a/texturing/tex1dfetch_htod.cu b/texturing/tex1dfetch_htod.cu
index 19703b8..d98d508 100644
--- a/texturing/tex1dfetch_htod.cu
+++ b/texturing/tex1dfetch_htod.cu
@@ -68,7 +68,7 @@ MeasureBandwidth( void *out, size_t N, int blocks, int threads )
     chTimerGetTime( &start );
 
     TexReadout<<<2,384>>>( (float *) out, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     chTimerGetTime( &stop );
 
diff --git a/texturing/tex1dfetch_int2float.cu b/texturing/tex1dfetch_int2float.cu
index f80be04..32c61a0 100644
--- a/texturing/tex1dfetch_int2float.cu
+++ b/texturing/tex1dfetch_int2float.cu
@@ -119,7 +119,7 @@ CheckTexPromoteToFloat( size_t N )
                       cudaCreateChannelDesc<T>(), 
                       N*sizeof(T)));
     TexReadout<<<2,384>>>( foutDevice, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int i = 0; i < N; i++ ) {
         printf( "%.2f ", foutHost[i] );
diff --git a/texturing/tex1dfetch_offset.cu b/texturing/tex1dfetch_offset.cu
index 9c00f1b..f2e7223 100644
--- a/texturing/tex1dfetch_offset.cu
+++ b/texturing/tex1dfetch_offset.cu
@@ -66,7 +66,7 @@ CheckTex( float *hostOut, const float *in, size_t offset, size_t N )
     cuda(HostGetDevicePointer( (void **) &deviceOut, hostOut, 0 ));
     
     TexReadout<<<2,384>>>( deviceOut, offset>>2, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         if ( in[i] != hostOut[i] ) {
             printf( "Mismatch at index %d\n", i );
diff --git a/texturing/tex2d_addressing.cu b/texturing/tex2d_addressing.cu
index 2d8feab..96f192d 100644
--- a/texturing/tex2d_addressing.cu
+++ b/texturing/tex2d_addressing.cu
@@ -139,7 +139,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
diff --git a/texturing/tex2d_addressing_device.cu b/texturing/tex2d_addressing_device.cu
index 9f52f6f..b629b7d 100644
--- a/texturing/tex2d_addressing_device.cu
+++ b/texturing/tex2d_addressing_device.cu
@@ -141,7 +141,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
diff --git a/texturing/tex2d_memset.cu b/texturing/tex2d_memset.cu
index 5ea168f..307a1aa 100644
--- a/texturing/tex2d_memset.cu
+++ b/texturing/tex2d_memset.cu
@@ -157,7 +157,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
 
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);
diff --git a/texturing/tex2d_opengl.cu b/texturing/tex2d_opengl.cu
index bbf2b06..0626c18 100644
--- a/texturing/tex2d_opengl.cu
+++ b/texturing/tex2d_opengl.cu
@@ -189,7 +189,7 @@ void displayCB(void)		/* function called whenever redisplay needed */
     else {
         RenderTextureUnnormalized<<<g_height, 384>>>( g_deviceFrameBuffer, g_width, g_height );
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         return;
     glRasterPos2f( 0.0f, 0.0f );
     glDrawPixels( g_width, g_height, GL_RGBA, GL_UNSIGNED_BYTE, g_hostFrameBuffer );