diff --git a/SMs/divergence.cu b/SMs/divergence.cu index f71a633..fcda9b7 100644 --- a/SMs/divergence.cu +++ b/SMs/divergence.cu @@ -4,7 +4,7 @@ * * Microdemo to measure performance implications of conditional code. * - * Build with: nvcc [--gpu-architecture sm_xx] [-D USE_FLOAT] [-D USE_IF_STATEMENT] divergence.cu + * Build with: nvcc [--gpu-architecture sm_xx] divergence.cu * Requires: No minimum SM requirement. * * Copyright (c) 2021, Archaea Software, LLC. @@ -46,66 +46,79 @@ // apply blockDim and/or gridDim to n before passing in. // template -void __device__ sumFloats( float *p, size_t N, size_t n ) +void __device__ sumInts( uint32_t *p, size_t N, size_t n ) { - float f = base; for ( size_t i = 0; i < N; i++ ) { - *p += f; + *p += base; p += n; } } -typedef void(*psumFloats)(float *, size_t, size_t); +typedef void(*psumInts)(uint32_t *, size_t, size_t); -__device__ psumFloats rgSumFloats[] = { - sumFloats< 0>, sumFloats< 1>, sumFloats< 2>, sumFloats< 3>, - sumFloats< 4>, sumFloats< 5>, sumFloats< 6>, sumFloats< 7>, - sumFloats< 8>, sumFloats< 9>, sumFloats<10>, sumFloats<11>, - sumFloats<12>, sumFloats<13>, sumFloats<14>, sumFloats<15>, - sumFloats<16>, sumFloats<17>, sumFloats<18>, sumFloats<19>, - sumFloats<20>, sumFloats<21>, sumFloats<22>, sumFloats<23>, - sumFloats<24>, sumFloats<25>, sumFloats<26>, sumFloats<27>, - sumFloats<28>, sumFloats<29>, sumFloats<30>, sumFloats<31> }; +__device__ psumInts rgSumInts[] = { + sumInts< 0>, sumInts< 1>, sumInts< 2>, sumInts< 3>, + sumInts< 4>, sumInts< 5>, sumInts< 6>, sumInts< 7>, + sumInts< 8>, sumInts< 9>, sumInts<10>, sumInts<11>, + sumInts<12>, sumInts<13>, sumInts<14>, sumInts<15>, + sumInts<16>, sumInts<17>, sumInts<18>, sumInts<19>, + sumInts<20>, sumInts<21>, sumInts<22>, sumInts<23>, + sumInts<24>, sumInts<25>, sumInts<26>, sumInts<27>, + sumInts<28>, sumInts<29>, sumInts<30>, sumInts<31> }; +template __global__ void -sumFloats_bywarp( float *p, size_t N ) +sumInts_bythread( uint32_t *p, size_t N ) { - uint32_t warpid = threadIdx.x>>5; + uint32_t warpish_id = threadIdx.x>>sh; N /= blockDim.x*gridDim.x; - rgSumFloats[warpid]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x ); + rgSumInts[warpish_id&31]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x ); } -__global__ void -sumFloats_bythread( float *p, size_t N ) -{ - -} - -int -main() +template +static double +timeByThreads( uint32_t *p, size_t N ) { cudaError_t status; - size_t N = 1024*1024*1024UL; - float *p = 0; - float et; + float elapsed_time; + double ret = 0.0; cudaEvent_t start = 0, stop = 0; - cuda(Malloc( (void **) &p, N*sizeof(float)) ); - cuda(Memset( p, 0, N*sizeof(float)) ); cuda(EventCreate( &start )); cuda(EventCreate( &stop )); cuda(EventRecord( start )); - sumFloats_bywarp<<<3072,256>>>( p, N ); + sumInts_bythread<<<3072,1024>>>( p, N ); cuda(EventRecord( stop )); cuda(DeviceSynchronize()); - cuda(EventElapsedTime( &et, start, stop )); - - printf( "%.2f ms = %.2f Gops/s\n", et, (double) N*1000.0/et/1e9 ); - - cudaFree( p ); + cuda(EventElapsedTime( &elapsed_time, start, stop )); + ret = N*1000.0/elapsed_time/1e9; + printf( "%2d threads: %f Gops/s\n", 1<( p, N ); + timeByThreads<5>( p, N ); + timeByThreads<4>( p, N ); + timeByThreads<3>( p, N ); + timeByThreads<2>( p, N ); + timeByThreads<1>( p, N ); + timeByThreads<0>( p, N ); + + cudaFree( p ); return 0; Error: return 1; diff --git a/SMs/testShuffle.cu b/SMs/testShuffle.cu index 3b719a6..b1f329f 100644 --- a/SMs/testShuffle.cu +++ b/SMs/testShuffle.cu @@ -48,7 +48,7 @@ TestShuffle( int *out, const int *in, size_t N ) size_t i = blockIdx.x*blockDim.x+threadIdx.x; int value = (int) i;//in[i]; - out[i] = __shfl_up( value, 1 ); + out[i] = __shfl_up_sync( 0xffffffff, value, 1 ); } cudaError_t diff --git a/chLib/chError.h b/chLib/chError.h index 53553ad..b332dad 100644 --- a/chLib/chError.h +++ b/chLib/chError.h @@ -112,6 +112,40 @@ chGetErrorString( CUresult status ) ErrorValue(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED) ErrorValue(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED) #endif + ErrorValue(CUDA_ERROR_STUB_LIBRARY) + ErrorValue(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED) + ErrorValue(CUDA_ERROR_DEVICE_NOT_LICENSED) + ErrorValue(CUDA_ERROR_INVALID_PTX) + ErrorValue(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT) + ErrorValue(CUDA_ERROR_NVLINK_UNCORRECTABLE) + ErrorValue(CUDA_ERROR_JIT_COMPILER_NOT_FOUND) + ErrorValue(CUDA_ERROR_JIT_COMPILATION_DISABLED) + ErrorValue(CUDA_ERROR_UNSUPPORTED_PTX_VERSION) + ErrorValue(CUDA_ERROR_ILLEGAL_STATE) + ErrorValue(CUDA_ERROR_ILLEGAL_ADDRESS) + ErrorValue(CUDA_ERROR_HARDWARE_STACK_ERROR) + ErrorValue(CUDA_ERROR_ILLEGAL_INSTRUCTION) + ErrorValue(CUDA_ERROR_MISALIGNED_ADDRESS) + ErrorValue(CUDA_ERROR_INVALID_ADDRESS_SPACE) + ErrorValue(CUDA_ERROR_INVALID_PC) + ErrorValue(CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE) + ErrorValue(CUDA_ERROR_NOT_PERMITTED) + ErrorValue(CUDA_ERROR_NOT_SUPPORTED) + ErrorValue(CUDA_ERROR_SYSTEM_NOT_READY) + ErrorValue(CUDA_ERROR_SYSTEM_DRIVER_MISMATCH) + ErrorValue(CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_INVALIDATED) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_MERGE) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNMATCHED) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNJOINED) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_ISOLATION) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_IMPLICIT) + ErrorValue(CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD) + ErrorValue(CUDA_ERROR_TIMEOUT) + ErrorValue(CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE) + ErrorValue(CUDA_ERROR_CAPTURED_EVENT) + ErrorValue(CUDA_ERROR_UNKNOWN) } return "chGetErrorString - unknown error value"; diff --git a/concurrency/breakevenDtoHMemcpy.cu b/concurrency/breakevenDtoHMemcpy.cu index 5d2dc0b..404ed5d 100644 --- a/concurrency/breakevenDtoHMemcpy.cu +++ b/concurrency/breakevenDtoHMemcpy.cu @@ -70,7 +70,7 @@ main( int argc, char *argv[] ) cuda(MemcpyAsync( hostInt, deviceInt, byteCount, cudaMemcpyDeviceToHost, NULL ) ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/breakevenHtoDMemcpy.cu b/concurrency/breakevenHtoDMemcpy.cu index 707683e..0ec1dfa 100644 --- a/concurrency/breakevenHtoDMemcpy.cu +++ b/concurrency/breakevenHtoDMemcpy.cu @@ -70,7 +70,7 @@ main( int argc, char *argv[] ) cuda(MemcpyAsync( deviceInt, hostInt, byteCount, cudaMemcpyHostToDevice, NULL ) ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/breakevenKernelAsync.cu b/concurrency/breakevenKernelAsync.cu index 763b22b..6067453 100644 --- a/concurrency/breakevenKernelAsync.cu +++ b/concurrency/breakevenKernelAsync.cu @@ -67,7 +67,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { WaitKernel<<<1,1>>>( 0, false ); } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); printf("Cycles\tus\n" ); for ( int cycles = 0; cycles < 2500; cycles += 100 ) { @@ -76,7 +76,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { WaitKernel<<<1,1>>>( cycles, false ); } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); chTimerGetTime( &stop ); double microseconds = 1e6*chTimerElapsedTime( &start, &stop ); double usPerLaunch = microseconds / (float) cIterations; diff --git a/concurrency/eventRecord.cu b/concurrency/eventRecord.cu index 689d6b7..91600fc 100644 --- a/concurrency/eventRecord.cu +++ b/concurrency/eventRecord.cu @@ -72,7 +72,7 @@ usPerLaunch( int cIterations, int cEvents ) cuda(EventRecord( events[j], NULL ) ); } } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); microseconds = 1e6*chTimerElapsedTime( &start, &stop ); diff --git a/concurrency/managedOverhead.cu b/concurrency/managedOverhead.cu index 7f0f341..517e0d9 100644 --- a/concurrency/managedOverhead.cu +++ b/concurrency/managedOverhead.cu @@ -73,7 +73,7 @@ usPerLaunch( int cIterations, size_t cPages=0 ) chTimerGetTime( &start ); for ( int i = 0; i < cIterations; i++ ) { NullKernel<<<1,1>>>(); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); if ( bTouch && 0 != p ) { for ( int iPage = 0; iPage < cPages; iPage++ ) { ((volatile unsigned char *) p)[iPage*pageSize] |= 1; diff --git a/concurrency/nullDtoHMemcpyAsync.cu b/concurrency/nullDtoHMemcpyAsync.cu index 92d8fc9..7153c22 100644 --- a/concurrency/nullDtoHMemcpyAsync.cu +++ b/concurrency/nullDtoHMemcpyAsync.cu @@ -60,7 +60,7 @@ main( int argc, char *argv[] ) cuda(MemcpyAsync( hostInt, deviceInt, sizeof(int), cudaMemcpyDeviceToHost, NULL ) ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/nullDtoHMemcpySync.cu b/concurrency/nullDtoHMemcpySync.cu index e3c3812..6bbd360 100644 --- a/concurrency/nullDtoHMemcpySync.cu +++ b/concurrency/nullDtoHMemcpySync.cu @@ -61,7 +61,7 @@ main( int argc, char *argv[] ) cuda(Memcpy( hostInt, deviceInt, sizeof(int), cudaMemcpyDeviceToHost ) ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); cIterations *= 2; } while ( chTimerElapsedTime( &start, &stop ) < 0.5f ) ; diff --git a/concurrency/nullHtoDMemcpyAsync.cu b/concurrency/nullHtoDMemcpyAsync.cu index d0a8cd2..c90e4bc 100644 --- a/concurrency/nullHtoDMemcpyAsync.cu +++ b/concurrency/nullHtoDMemcpyAsync.cu @@ -60,7 +60,7 @@ main( int argc, char *argv[] ) cuda(MemcpyAsync( deviceInt, hostInt, sizeof(int), cudaMemcpyHostToDevice, NULL ) ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/nullKernelAsync.cu b/concurrency/nullKernelAsync.cu index 0003fb6..0d3ca09 100644 --- a/concurrency/nullKernelAsync.cu +++ b/concurrency/nullKernelAsync.cu @@ -60,7 +60,7 @@ usPerLaunch( int cIterations ) for ( int i = 0; i < cIterations; i++ ) { NullKernel<<<1,1>>>(); } - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); chTimerGetTime( &stop ); microseconds = 1e6*chTimerElapsedTime( &start, &stop ); diff --git a/concurrency/nullKernelSync.cu b/concurrency/nullKernelSync.cu index 0b0b4dc..6b1c3c8 100644 --- a/concurrency/nullKernelSync.cu +++ b/concurrency/nullKernelSync.cu @@ -59,7 +59,7 @@ usPerLaunch( int cIterations ) chTimerGetTime( &start ); for ( int i = 0; i < cIterations; i++ ) { NullKernel<<<1,1>>>(); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); } chTimerGetTime( &stop ); diff --git a/concurrency/pageableMemcpyHtoD.cu b/concurrency/pageableMemcpyHtoD.cu index dc94cca..a2f03da 100644 --- a/concurrency/pageableMemcpyHtoD.cu +++ b/concurrency/pageableMemcpyHtoD.cu @@ -145,7 +145,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ; } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/pageableMemcpyHtoD16.cu b/concurrency/pageableMemcpyHtoD16.cu index a5059eb..b0c1f51 100644 --- a/concurrency/pageableMemcpyHtoD16.cu +++ b/concurrency/pageableMemcpyHtoD16.cu @@ -157,7 +157,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ; } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/pageableMemcpyHtoD16Blocking.cu b/concurrency/pageableMemcpyHtoD16Blocking.cu index d24dd29..f3973f4 100644 --- a/concurrency/pageableMemcpyHtoD16Blocking.cu +++ b/concurrency/pageableMemcpyHtoD16Blocking.cu @@ -156,7 +156,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ; } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/pageableMemcpyHtoD16Broken.cu b/concurrency/pageableMemcpyHtoD16Broken.cu index 9cd53b3..643d7a8 100644 --- a/concurrency/pageableMemcpyHtoD16Broken.cu +++ b/concurrency/pageableMemcpyHtoD16Broken.cu @@ -157,7 +157,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ; } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/pageableMemcpyHtoD16Synchronous.cu b/concurrency/pageableMemcpyHtoD16Synchronous.cu index fe09c5e..49a2441 100644 --- a/concurrency/pageableMemcpyHtoD16Synchronous.cu +++ b/concurrency/pageableMemcpyHtoD16Synchronous.cu @@ -157,7 +157,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ; } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/concurrency/peer2peerMemcpy.cu b/concurrency/peer2peerMemcpy.cu index 5d33d08..826f3b9 100644 --- a/concurrency/peer2peerMemcpy.cu +++ b/concurrency/peer2peerMemcpy.cu @@ -195,7 +195,7 @@ main( int argc, char *argv[] ) for ( int i = 0; i < cIterations; i++ ) { chMemcpyPeerToPeer( deviceInt[0], 0, deviceInt[1], 1, numInts*sizeof(int) ) ; } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); { diff --git a/corr/normalizedCrossCorrelation.cu b/corr/normalizedCrossCorrelation.cu index eea6511..1413bf7 100644 --- a/corr/normalizedCrossCorrelation.cu +++ b/corr/normalizedCrossCorrelation.cu @@ -476,7 +476,8 @@ main(int argc, char *argv[]) int sharedPitch; int sharedMem; - char *inputFilename = "coins.pgm"; + char defaultInputFilename[] = "coins.pgm"; + char *inputFilename = defaultInputFilename; char *outputFilename = NULL; cudaArray *pArrayImage = NULL; diff --git a/histogram/histogram.cu b/histogram/histogram.cu index 6234189..d0a3167 100644 --- a/histogram/histogram.cu +++ b/histogram/histogram.cu @@ -261,7 +261,8 @@ main(int argc, char *argv[]) dim3 threads; - char *inputFilename = "coins.pgm"; + char defaultInputFilename[] = "coins.pgm"; + char *inputFilename = defaultInputFilename; char *outputFilename = NULL; cudaArray *pArrayImage = NULL; diff --git a/memory/globalCopy.cu b/memory/globalCopy.cu index 06d63b3..7131380 100644 --- a/memory/globalCopy.cu +++ b/memory/globalCopy.cu @@ -114,7 +114,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn, GlobalCopy<<>>( deviceOut+bOffsetDst, deviceIn+bOffsetSrc, N-bOffsetDst-bOffsetSrc ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/memory/globalCopy2.cu b/memory/globalCopy2.cu index 9cf1199..7908949 100644 --- a/memory/globalCopy2.cu +++ b/memory/globalCopy2.cu @@ -160,7 +160,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn0, T *deviceIn1, GlobalCopy<<>>( deviceOut+bOffsetDst, deviceIn0+bOffsetSrc, deviceIn1+bOffsetSrc, N-bOffsetDst-bOffsetSrc ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/memory/globalRead.cu b/memory/globalRead.cu index f191953..b2aad68 100644 --- a/memory/globalRead.cu +++ b/memory/globalRead.cu @@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads ) GlobalReads<<>>( out, in+bOffset, N-bOffset, false ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/memory/globalReadTex.cu b/memory/globalReadTex.cu index 3829156..88cf64c 100644 --- a/memory/globalReadTex.cu +++ b/memory/globalReadTex.cu @@ -236,7 +236,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads ) GlobalReads<<>>( out, bOffset, N-bOffset, false ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/memory/globalWrite.cu b/memory/globalWrite.cu index be88461..f71cbc4 100644 --- a/memory/globalWrite.cu +++ b/memory/globalWrite.cu @@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads ) GlobalWrites<<>>( out+bOffset, (T) 0xcc, N-bOffset ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/memory/spinlockReduction.cu b/memory/spinlockReduction.cu index fd0a3fd..e08395b 100644 --- a/memory/spinlockReduction.cu +++ b/memory/spinlockReduction.cu @@ -208,7 +208,7 @@ AtomicsPerSecond( size_t N, int cBlocks, int cThreads ) } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); diff --git a/microbench/globalRead.cu b/microbench/globalRead.cu index 7041b92..2049814 100644 --- a/microbench/globalRead.cu +++ b/microbench/globalRead.cu @@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads ) GlobalReads<<>>( out, in+bOffset, N-bOffset, false ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/microbench/globalWrite.cu b/microbench/globalWrite.cu index d63cc18..ca19c31 100644 --- a/microbench/globalWrite.cu +++ b/microbench/globalWrite.cu @@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads ) GlobalWrites<<>>( out+bOffset, (T) 0xcc, N-bOffset ); } cudaEventRecord( evStop ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); // make configurations that cannot launch error-out with 0 bandwidth cuda(GetLastError() ); cuda(EventElapsedTime( &ms, evStart, evStop ) ); diff --git a/microbench/reportClocks.cu b/microbench/reportClocks.cu index e9598a2..20ab911 100644 --- a/microbench/reportClocks.cu +++ b/microbench/reportClocks.cu @@ -97,13 +97,13 @@ ReportTimesAndIDs( FILE *clocksFile, FILE *tidsFile, dim3 gridSize, dim3 blockSi cuda(EventCreate( &stop ) ); WriteClockValues<<>>( deviceClockValues, deviceThreadIDs ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); cuda(EventRecord( start, 0 ) ); WriteClockValues<<>>( deviceClockValues, deviceThreadIDs ); cuda(EventRecord( stop, 0 ) ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); { float ms; diff --git a/nbody/nbody.cu b/nbody/nbody.cu index 7c1a9e0..56bc70b 100644 --- a/nbody/nbody.cu +++ b/nbody/nbody.cu @@ -7,8 +7,8 @@ * parallelizable, with lots of FLOPS per unit of external * memory bandwidth required. * - * Build with: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu - * On Linux: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lpthread -lrt + * Build with: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu + * On Linux: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lcudart_static -ldl -lrt * Requires: No minimum SM requirement. If SM 3.x is not available, * this application quietly replaces the shuffle and fast-atomic * implementations with the shared memory implementation. @@ -482,7 +482,7 @@ ComputeGravitation( sumY += g_hostAOS_Force[i*3+1]; sumZ += g_hostAOS_Force[i*3+2]; } - *maxRelError = max( fabs(sumX), max(fabs(sumY), fabs(sumZ)) ); + *maxRelError = std::max( fabs(sumX), std::max(fabs(sumY), fabs(sumZ)) ); if ( g_ZeroThreshold != 0.0 && fabs( *maxRelError ) > g_ZeroThreshold ) { printf( "Maximum sum of forces > threshold (%E > %E)\n", diff --git a/nbody/nbody_GPU_AOS_tiled.cuh b/nbody/nbody_GPU_AOS_tiled.cuh index 66d7c75..78b2ec3 100644 --- a/nbody/nbody_GPU_AOS_tiled.cuh +++ b/nbody/nbody_GPU_AOS_tiled.cuh @@ -78,11 +78,11 @@ inline float __device__ warpReduce( float x ) { - x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 8 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 4 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 2 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 1 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 8 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 4 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 2 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 1 ) ); return x; } @@ -111,10 +111,10 @@ DoNondiagonalTile_GPU( float fx, fy, fz; float4 bodyPosMass; - bodyPosMass.x = __shfl( shufSrcPosMass.x, _j ); - bodyPosMass.y = __shfl( shufSrcPosMass.y, _j ); - bodyPosMass.z = __shfl( shufSrcPosMass.z, _j ); - bodyPosMass.w = __shfl( shufSrcPosMass.w, _j ); + bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j ); + bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j ); + bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j ); + bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j ); bodyBodyInteraction( &fx, &fy, &fz, diff --git a/nbody/nbody_GPU_AOS_tiled_const.cuh b/nbody/nbody_GPU_AOS_tiled_const.cuh index 9295d94..ef31c6e 100644 --- a/nbody/nbody_GPU_AOS_tiled_const.cuh +++ b/nbody/nbody_GPU_AOS_tiled_const.cuh @@ -82,11 +82,11 @@ __device__ warpReduce_const( float x ) { #if __CUDA_ARCH__ && __CUDA_ARCH__ > 300 - x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 8 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 4 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 2 ) ); - x += __int_as_float( __shfl_xor( __float_as_int(x), 1 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 8 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 4 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 2 ) ); + x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 1 ) ); #endif return x; } @@ -117,10 +117,10 @@ DoNondiagonalTile_GPU_const( float fx, fy, fz; float4 bodyPosMass; - bodyPosMass.x = __shfl( shufSrcPosMass.x, _j ); - bodyPosMass.y = __shfl( shufSrcPosMass.y, _j ); - bodyPosMass.z = __shfl( shufSrcPosMass.z, _j ); - bodyPosMass.w = __shfl( shufSrcPosMass.w, _j ); + bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j ); + bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j ); + bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j ); + bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j ); bodyBodyInteraction( &fx, &fy, &fz, @@ -200,10 +200,10 @@ DoNondiagonalTile_GPU_const( float fx, fy, fz; float4 bodyPosMass; - bodyPosMass.x = __shfl( shufSrcPosMass.x, _j ); - bodyPosMass.y = __shfl( shufSrcPosMass.y, _j ); - bodyPosMass.z = __shfl( shufSrcPosMass.z, _j ); - bodyPosMass.w = __shfl( shufSrcPosMass.w, _j ); + bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j ); + bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j ); + bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j ); + bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j ); bodyBodyInteraction( &fx, &fy, &fz, diff --git a/nbody/nbody_GPU_SOA_tiled.cuh b/nbody/nbody_GPU_SOA_tiled.cuh index 1287a9d..959b3c2 100644 --- a/nbody/nbody_GPU_SOA_tiled.cuh +++ b/nbody/nbody_GPU_SOA_tiled.cuh @@ -98,10 +98,10 @@ DoNondiagonalTile_GPU_SOA( float fx, fy, fz; float4 bodyPosMass; - bodyPosMass.x = __shfl( shufSrcPosMass.x, _j ); - bodyPosMass.y = __shfl( shufSrcPosMass.y, _j ); - bodyPosMass.z = __shfl( shufSrcPosMass.z, _j ); - bodyPosMass.w = __shfl( shufSrcPosMass.w, _j ); + bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j ); + bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j ); + bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j ); + bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j ); bodyBodyInteraction( &fx, &fy, &fz, diff --git a/nbody/nbody_GPU_Shuffle.cuh b/nbody/nbody_GPU_Shuffle.cuh index 787b423..4e87f8c 100644 --- a/nbody/nbody_GPU_Shuffle.cuh +++ b/nbody/nbody_GPU_Shuffle.cuh @@ -56,10 +56,10 @@ ComputeNBodyGravitation_Shuffle( float fx, fy, fz; float4 shufDstPosMass; - shufDstPosMass.x = __shfl( shufSrcPosMass.x, k ); - shufDstPosMass.y = __shfl( shufSrcPosMass.y, k ); - shufDstPosMass.z = __shfl( shufSrcPosMass.z, k ); - shufDstPosMass.w = __shfl( shufSrcPosMass.w, k ); + shufDstPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, k ); + shufDstPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, k ); + shufDstPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, k ); + shufDstPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, k ); bodyBodyInteraction( &fx, &fy, &fz, diff --git a/reduction/reduction.cu b/reduction/reduction.cu index c8d9802..aa7605b 100644 --- a/reduction/reduction.cu +++ b/reduction/reduction.cu @@ -85,7 +85,7 @@ TimedReduction( cuda(Malloc( &partialSums, cBlocks*sizeof(int) ) ); cuda(EventCreate( &start ) ); cuda(EventCreate( &stop ) ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); cuda(EventRecord( start, 0 ) ); hostReduction( @@ -165,7 +165,7 @@ usPerInvocation( int cIterations, size_t N, for ( int i = 0; i < cIterations; i++ ) { pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); ret = chTimerElapsedTime( &start, &stop ); ret = (ret / (double) cIterations) * 1e6; diff --git a/reduction/reductionTemplated.cu b/reduction/reductionTemplated.cu index 8be6a23..285bf98 100644 --- a/reduction/reductionTemplated.cu +++ b/reduction/reductionTemplated.cu @@ -129,7 +129,7 @@ TimedReduction( cuda(Malloc( &partialSums, cBlocks*sizeof(ReductionType) ) ); cuda(EventCreate( &start ) ); cuda(EventCreate( &stop ) ); - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); cuda(EventRecord( start, 0 ) ); hostReduction( deviceAnswer, partialSums, deviceIn, N, cBlocks, cThreads ); @@ -209,7 +209,7 @@ usPerInvocation( int cIterations, size_t N, for ( int i = 0; i < cIterations; i++ ) { pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 ); } - cuda(ThreadSynchronize() ); + cuda(DeviceSynchronize() ); chTimerGetTime( &stop ); ret = chTimerElapsedTime( &start, &stop ); ret = (ret / (double) cIterations) * 1e6; diff --git a/reduction/reductionWarpShuffle.cuh b/reduction/reductionWarpShuffle.cuh index 969e595..72cd018 100644 --- a/reduction/reductionWarpShuffle.cuh +++ b/reduction/reductionWarpShuffle.cuh @@ -61,11 +61,11 @@ Reduction2_kernel( int *out, const int *in, size_t N ) volatile int *wsSum = sPartials; if ( blockDim.x > 32 ) wsSum[tid] += wsSum[tid + 32]; int mySum = wsSum[tid]; - mySum += __shfl_xor( mySum, 16 ); - mySum += __shfl_xor( mySum, 8 ); - mySum += __shfl_xor( mySum, 4 ); - mySum += __shfl_xor( mySum, 2 ); - mySum += __shfl_xor( mySum, 1 ); + mySum += __shfl_xor_sync( 0xffffffff, mySum, 16 ); + mySum += __shfl_xor_sync( 0xffffffff, mySum, 8 ); + mySum += __shfl_xor_sync( 0xffffffff, mySum, 4 ); + mySum += __shfl_xor_sync( 0xffffffff, mySum, 2 ); + mySum += __shfl_xor_sync( 0xffffffff, mySum, 1 ); /* wsSum[tid] += wsSum[tid + 16]; wsSum[tid] += wsSum[tid + 8]; wsSum[tid] += wsSum[tid + 4]; diff --git a/scan/int/timeScan.cu b/scan/int/timeScan.cu index 81cf9fa..a82d06c 100644 --- a/scan/int/timeScan.cu +++ b/scan/int/timeScan.cu @@ -96,7 +96,7 @@ TimeScan( void (*pfnScanGPU)(T *, const T *, size_t, int), for ( int i = 0; i < cIterations; i++ ) { pfnScanGPU( outGPU, inGPU, N, numThreads ); } - if ( cudaSuccess != cudaThreadSynchronize() ) + if ( cudaSuccess != cudaDeviceSynchronize() ) goto Error; chTimerGetTime( &stop ); diff --git a/scan/streamCompact/timeStreamCompact_odd.cu b/scan/streamCompact/timeStreamCompact_odd.cu index 8d8a1ea..dafd0b1 100644 --- a/scan/streamCompact/timeStreamCompact_odd.cu +++ b/scan/streamCompact/timeStreamCompact_odd.cu @@ -122,7 +122,7 @@ TimeStreamCompact( for ( int i = 0; i < cIterations; i++ ) { pfnScanGPU( outGPU, deviceTotal, inGPU, N, numThreads ); } - if ( cudaSuccess != cudaThreadSynchronize() ) + if ( cudaSuccess != cudaDeviceSynchronize() ) goto Error; chTimerGetTime( &stop ); diff --git a/scan/warp/scanWarpShuffle.cuh b/scan/warp/scanWarpShuffle.cuh index 4573ac4..5521465 100644 --- a/scan/warp/scanWarpShuffle.cuh +++ b/scan/warp/scanWarpShuffle.cuh @@ -69,7 +69,7 @@ exclusive_scan_warp_shfl(int mysum) { const unsigned int lane = threadIdx.x & 31; for(int i = 0; i < levels; ++i) - mysum = shfl_scan_add_step(mysum, 1 << i); + mysum = scanWarpShuffle_step( mysum, 1 << i); mysum = __shfl_up(mysum, 1); return (lane) ? mysum : 0; } diff --git a/scan/warp/testScanWarp.cu b/scan/warp/testScanWarp.cu index 34865b7..11f5706 100644 --- a/scan/warp/testScanWarp.cu +++ b/scan/warp/testScanWarp.cu @@ -63,7 +63,7 @@ enum ScanType { #include "scanReduceThenScan.cuh" #include "scanReduceThenScan_0.cuh" #include "scan2Level.cuh" -#include "ScanThrust.cuh" +#include "scanThrust.cuh" template void diff --git a/texturing/surf2Dmemset.cu b/texturing/surf2Dmemset.cu index eb62aa1..6f0fa60 100644 --- a/texturing/surf2Dmemset.cu +++ b/texturing/surf2Dmemset.cu @@ -184,7 +184,7 @@ CreateAndPrintTex( blocks.y = 1; threads.x = 64; threads.y = 4; TexReadout<<>>( outDevice, outWidth, outPitch, outHeight, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int row = 0; row < outHeight; row++ ) { float4 *outrow = (float4 *) ((char *) outHost + row*outPitch); diff --git a/texturing/tex1d_9bit.cu b/texturing/tex1d_9bit.cu index a4bb5f2..df339d4 100644 --- a/texturing/tex1d_9bit.cu +++ b/texturing/tex1d_9bit.cu @@ -98,7 +98,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN, tex.addressMode[0] = addressMode; cuda(HostGetDevicePointer(&outDevice, outHost, 0)); TexReadout<<<2,384>>>( outDevice, outN, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); printf( "X\tY\tActual Value\tExpected Value\tDiff\n" ); for ( int i = 0; i < outN; i++ ) { diff --git a/texturing/tex1d_addressing.cu b/texturing/tex1d_addressing.cu index 49d28f1..d3e2e0c 100644 --- a/texturing/tex1d_addressing.cu +++ b/texturing/tex1d_addressing.cu @@ -169,7 +169,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN, tex.filterMode = filterMode; tex.addressMode[0] = addressMode; TexReadout<<<2,384>>>( outDevice, outN, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int i = 0; i < outN; i++ ) { float x = base+(float)i*increment; diff --git a/texturing/tex1d_unnormalized.cu b/texturing/tex1d_unnormalized.cu index 98fbb07..61ccaa7 100644 --- a/texturing/tex1d_unnormalized.cu +++ b/texturing/tex1d_unnormalized.cu @@ -112,7 +112,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN, tex.addressMode[0] = addressMode; cuda(HostGetDevicePointer(&outDevice, outHost, 0)); TexReadout<<<2,384>>>( outDevice, outN, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int i = 0; i < outN; i++ ) { printf( "(%.2f, %.2f)\n", outHost[i].x, outHost[i].y ); diff --git a/texturing/tex1dfetch.cu b/texturing/tex1dfetch.cu index 25cd590..9e2627e 100644 --- a/texturing/tex1dfetch.cu +++ b/texturing/tex1dfetch.cu @@ -65,7 +65,7 @@ PrintTex( float *host, size_t N ) cuda(HostGetDevicePointer( (void **) &device, host, 0 )); TexReadout<<<2,384>>>( device, N ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int i = 0; i < N; i++ ) { printf( "%.2f ", host[i] ); } diff --git a/texturing/tex1dfetch_big.cu b/texturing/tex1dfetch_big.cu index a9990c8..1b4364c 100644 --- a/texturing/tex1dfetch_big.cu +++ b/texturing/tex1dfetch_big.cu @@ -177,7 +177,7 @@ TexChecksum( int *out, int c, size_t N ) default: goto Error; } - if ( cudaSuccess != cudaThreadSynchronize() ) + if ( cudaSuccess != cudaDeviceSynchronize() ) goto Error; *out = checksumGPU(); ret = true; diff --git a/texturing/tex1dfetch_host.cu b/texturing/tex1dfetch_host.cu index 488fe61..7279f05 100644 --- a/texturing/tex1dfetch_host.cu +++ b/texturing/tex1dfetch_host.cu @@ -65,7 +65,7 @@ PrintTex( float *host, size_t N ) cuda(HostGetDevicePointer( (void **) &device, host, 0 )); TexReadout<<<2,384>>>( device, N ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int i = 0; i < N; i++ ) { printf( "%.2f ", host[i] ); } diff --git a/texturing/tex1dfetch_htod.cu b/texturing/tex1dfetch_htod.cu index 19703b8..d98d508 100644 --- a/texturing/tex1dfetch_htod.cu +++ b/texturing/tex1dfetch_htod.cu @@ -68,7 +68,7 @@ MeasureBandwidth( void *out, size_t N, int blocks, int threads ) chTimerGetTime( &start ); TexReadout<<<2,384>>>( (float *) out, N ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); chTimerGetTime( &stop ); diff --git a/texturing/tex1dfetch_int2float.cu b/texturing/tex1dfetch_int2float.cu index f80be04..32c61a0 100644 --- a/texturing/tex1dfetch_int2float.cu +++ b/texturing/tex1dfetch_int2float.cu @@ -119,7 +119,7 @@ CheckTexPromoteToFloat( size_t N ) cudaCreateChannelDesc(), N*sizeof(T))); TexReadout<<<2,384>>>( foutDevice, N ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int i = 0; i < N; i++ ) { printf( "%.2f ", foutHost[i] ); diff --git a/texturing/tex1dfetch_offset.cu b/texturing/tex1dfetch_offset.cu index 9c00f1b..f2e7223 100644 --- a/texturing/tex1dfetch_offset.cu +++ b/texturing/tex1dfetch_offset.cu @@ -66,7 +66,7 @@ CheckTex( float *hostOut, const float *in, size_t offset, size_t N ) cuda(HostGetDevicePointer( (void **) &deviceOut, hostOut, 0 )); TexReadout<<<2,384>>>( deviceOut, offset>>2, N ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int i = 0; i < N; i++ ) { if ( in[i] != hostOut[i] ) { printf( "Mismatch at index %d\n", i ); diff --git a/texturing/tex2d_addressing.cu b/texturing/tex2d_addressing.cu index 2d8feab..96f192d 100644 --- a/texturing/tex2d_addressing.cu +++ b/texturing/tex2d_addressing.cu @@ -139,7 +139,7 @@ CreateAndPrintTex( blocks.y = 1; threads.x = 64; threads.y = 4; TexReadout<<>>( outDevice, outWidth, outPitch, outHeight, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int row = 0; row < outHeight; row++ ) { float4 *outrow = (float4 *) ((char *) outHost + row*outPitch); diff --git a/texturing/tex2d_addressing_device.cu b/texturing/tex2d_addressing_device.cu index 9f52f6f..b629b7d 100644 --- a/texturing/tex2d_addressing_device.cu +++ b/texturing/tex2d_addressing_device.cu @@ -141,7 +141,7 @@ CreateAndPrintTex( blocks.y = 1; threads.x = 64; threads.y = 4; TexReadout<<>>( outDevice, outWidth, outPitch, outHeight, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int row = 0; row < outHeight; row++ ) { float4 *outrow = (float4 *) ((char *) outHost + row*outPitch); diff --git a/texturing/tex2d_memset.cu b/texturing/tex2d_memset.cu index 5ea168f..307a1aa 100644 --- a/texturing/tex2d_memset.cu +++ b/texturing/tex2d_memset.cu @@ -157,7 +157,7 @@ CreateAndPrintTex( blocks.y = 1; threads.x = 64; threads.y = 4; TexReadout<<>>( outDevice, outWidth, outPitch, outHeight, base, increment ); - cuda(ThreadSynchronize()); + cuda(DeviceSynchronize()); for ( int row = 0; row < outHeight; row++ ) { float4 *outrow = (float4 *) ((char *) outHost + row*outPitch); diff --git a/texturing/tex2d_opengl.cu b/texturing/tex2d_opengl.cu index bbf2b06..0626c18 100644 --- a/texturing/tex2d_opengl.cu +++ b/texturing/tex2d_opengl.cu @@ -189,7 +189,7 @@ void displayCB(void) /* function called whenever redisplay needed */ else { RenderTextureUnnormalized<<>>( g_deviceFrameBuffer, g_width, g_height ); } - if ( cudaSuccess != cudaThreadSynchronize() ) + if ( cudaSuccess != cudaDeviceSynchronize() ) return; glRasterPos2f( 0.0f, 0.0f ); glDrawPixels( g_width, g_height, GL_RGBA, GL_UNSIGNED_BYTE, g_hostFrameBuffer );