
Fix bit rot, update to use modern idioms

nwilt-techdebt
Nicholas Wilt, 4 years ago
commit f9a5490496
 SMs/divergence.cu                              | 85
 SMs/testShuffle.cu                             |  2
 chLib/chError.h                                | 34
 concurrency/breakevenDtoHMemcpy.cu             |  2
 concurrency/breakevenHtoDMemcpy.cu             |  2
 concurrency/breakevenKernelAsync.cu            |  4
 concurrency/eventRecord.cu                     |  2
 concurrency/managedOverhead.cu                 |  2
 concurrency/nullDtoHMemcpyAsync.cu             |  2
 concurrency/nullDtoHMemcpySync.cu              |  2
 concurrency/nullHtoDMemcpyAsync.cu             |  2
 concurrency/nullKernelAsync.cu                 |  2
 concurrency/nullKernelSync.cu                  |  2
 concurrency/pageableMemcpyHtoD.cu              |  2
 concurrency/pageableMemcpyHtoD16.cu            |  2
 concurrency/pageableMemcpyHtoD16Blocking.cu    |  2
 concurrency/pageableMemcpyHtoD16Broken.cu      |  2
 concurrency/pageableMemcpyHtoD16Synchronous.cu |  2
 concurrency/peer2peerMemcpy.cu                 |  2
 corr/normalizedCrossCorrelation.cu             |  3
 histogram/histogram.cu                         |  3
 memory/globalCopy.cu                           |  2
 memory/globalCopy2.cu                          |  2
 memory/globalRead.cu                           |  2
 memory/globalReadTex.cu                        |  2
 memory/globalWrite.cu                          |  2
 memory/spinlockReduction.cu                    |  2
 microbench/globalRead.cu                       |  2
 microbench/globalWrite.cu                      |  2
 microbench/reportClocks.cu                     |  4
 nbody/nbody.cu                                 |  6
 nbody/nbody_GPU_AOS_tiled.cuh                  | 18
 nbody/nbody_GPU_AOS_tiled_const.cuh            | 26
 nbody/nbody_GPU_SOA_tiled.cuh                  |  8
 nbody/nbody_GPU_Shuffle.cuh                    |  8
 reduction/reduction.cu                         |  4
 reduction/reductionTemplated.cu                |  4
 reduction/reductionWarpShuffle.cuh             | 10
 scan/int/timeScan.cu                           |  2
 scan/streamCompact/timeStreamCompact_odd.cu    |  2
 scan/warp/scanWarpShuffle.cuh                  |  2
 scan/warp/testScanWarp.cu                      |  2
 texturing/surf2Dmemset.cu                      |  2
 texturing/tex1d_9bit.cu                        |  2
 texturing/tex1d_addressing.cu                  |  2
 texturing/tex1d_unnormalized.cu                |  2
 texturing/tex1dfetch.cu                        |  2
 texturing/tex1dfetch_big.cu                    |  2
 texturing/tex1dfetch_host.cu                   |  2
 texturing/tex1dfetch_htod.cu                   |  2
 texturing/tex1dfetch_int2float.cu              |  2
 texturing/tex1dfetch_offset.cu                 |  2
 texturing/tex2d_addressing.cu                  |  2
 texturing/tex2d_addressing_device.cu           |  2
 texturing/tex2d_memset.cu                      |  2
 texturing/tex2d_opengl.cu                      |  2
 56 files changed

SMs/divergence.cu | 85

@@ -4,7 +4,7 @@
  *
  * Microdemo to measure performance implications of conditional code.
  *
- * Build with: nvcc [--gpu-architecture sm_xx] [-D USE_FLOAT] [-D USE_IF_STATEMENT] divergence.cu
+ * Build with: nvcc [--gpu-architecture sm_xx] divergence.cu
  * Requires: No minimum SM requirement.
  *
  * Copyright (c) 2021, Archaea Software, LLC.
@@ -46,66 +46,79 @@
 // apply blockDim and/or gridDim to n before passing in.
 //
 template<int base>
-void __device__ sumFloats( float *p, size_t N, size_t n )
+void __device__ sumInts( uint32_t *p, size_t N, size_t n )
 {
-    float f = base;
     for ( size_t i = 0; i < N; i++ ) {
-        *p += f;
+        *p += base;
         p += n;
     }
 }
-typedef void(*psumFloats)(float *, size_t, size_t);
+typedef void(*psumInts)(uint32_t *, size_t, size_t);
-__device__ psumFloats rgSumFloats[] = {
-    sumFloats< 0>, sumFloats< 1>, sumFloats< 2>, sumFloats< 3>,
-    sumFloats< 4>, sumFloats< 5>, sumFloats< 6>, sumFloats< 7>,
-    sumFloats< 8>, sumFloats< 9>, sumFloats<10>, sumFloats<11>,
-    sumFloats<12>, sumFloats<13>, sumFloats<14>, sumFloats<15>,
-    sumFloats<16>, sumFloats<17>, sumFloats<18>, sumFloats<19>,
-    sumFloats<20>, sumFloats<21>, sumFloats<22>, sumFloats<23>,
-    sumFloats<24>, sumFloats<25>, sumFloats<26>, sumFloats<27>,
-    sumFloats<28>, sumFloats<29>, sumFloats<30>, sumFloats<31> };
+__device__ psumInts rgSumInts[] = {
+    sumInts< 0>, sumInts< 1>, sumInts< 2>, sumInts< 3>,
+    sumInts< 4>, sumInts< 5>, sumInts< 6>, sumInts< 7>,
+    sumInts< 8>, sumInts< 9>, sumInts<10>, sumInts<11>,
+    sumInts<12>, sumInts<13>, sumInts<14>, sumInts<15>,
+    sumInts<16>, sumInts<17>, sumInts<18>, sumInts<19>,
+    sumInts<20>, sumInts<21>, sumInts<22>, sumInts<23>,
+    sumInts<24>, sumInts<25>, sumInts<26>, sumInts<27>,
+    sumInts<28>, sumInts<29>, sumInts<30>, sumInts<31> };
+template<uint32_t sh>
 __global__ void
-sumFloats_bywarp( float *p, size_t N )
+sumInts_bythread( uint32_t *p, size_t N )
 {
-    uint32_t warpid = threadIdx.x>>5;
+    uint32_t warpish_id = threadIdx.x>>sh;
     N /= blockDim.x*gridDim.x;
-    rgSumFloats[warpid]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
+    rgSumInts[warpish_id&31]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
 }
-__global__ void
-sumFloats_bythread( float *p, size_t N )
-{
-}
-int
-main()
+template<uint32_t sh>
+static double
+timeByThreads( uint32_t *p, size_t N )
 {
     cudaError_t status;
-    size_t N = 1024*1024*1024UL;
-    float *p = 0;
-    float et;
+    float elapsed_time;
+    double ret = 0.0;
     cudaEvent_t start = 0, stop = 0;
-    cuda(Malloc( (void **) &p, N*sizeof(float)) );
-    cuda(Memset( p, 0, N*sizeof(float)) );
     cuda(EventCreate( &start ));
     cuda(EventCreate( &stop ));
     cuda(EventRecord( start ));
-    sumFloats_bywarp<<<3072,256>>>( p, N );
+    sumInts_bythread<sh><<<3072,1024>>>( p, N );
     cuda(EventRecord( stop ));
     cuda(DeviceSynchronize());
-    cuda(EventElapsedTime( &et, start, stop ));
-    printf( "%.2f ms = %.2f Gops/s\n", et, (double) N*1000.0/et/1e9 );
-    cudaFree( p );
+    cuda(EventElapsedTime( &elapsed_time, start, stop ));
+    ret = N*1000.0/elapsed_time/1e9;
+    printf( "%2d threads: %f Gops/s\n", 1<<sh, ret );
+Error:
+    cudaEventDestroy( stop );
+    cudaEventDestroy( start );
+    return ret;
 }
+int
+main()
+{
+    cudaError_t status;
+    size_t N = 1024*1024*1024UL;
+    uint32_t *p = 0;
+    cuda(Malloc( (void **) &p, N*sizeof(uint32_t)) );
+    cuda(Memset( p, 0, N*sizeof(uint32_t)) );
+    timeByThreads<6>( p, N );
+    timeByThreads<5>( p, N );
+    timeByThreads<4>( p, N );
+    timeByThreads<3>( p, N );
+    timeByThreads<2>( p, N );
+    timeByThreads<1>( p, N );
+    timeByThreads<0>( p, N );
+    cudaFree( p );
     return 0;
 Error:
     return 1;

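Note: the reworked divergence.cu sweeps divergence granularity rather than measuring one fixed case. Each group of 2^sh adjacent threads shares one entry of the 32-entry function-pointer table, so sh=5 gives one call target per warp (no divergence) while sh=0 makes every lane of a warp call a different function. A sketch of the mapping the kernel above computes:

    uint32_t warpish_id = threadIdx.x >> sh;   // id of this thread's group of 2^sh threads
    uint32_t entry      = warpish_id & 31;     // table slot; a warp spans 32>>sh distinct slots
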
SMs/testShuffle.cu | 2

@@ -48,7 +48,7 @@ TestShuffle( int *out, const int *in, size_t N )
     size_t i = blockIdx.x*blockDim.x+threadIdx.x;
     int value = (int) i;//in[i];
-    out[i] = __shfl_up( value, 1 );
+    out[i] = __shfl_up_sync( 0xffffffff, value, 1 );
 }
 cudaError_t

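Note: CUDA 9.0 deprecated the unsuffixed warp shuffles, and sm_70 and later require the *_sync variants, whose leading mask names the lanes expected to participate. A minimal illustrative kernel (not part of the commit; the name ShiftUpDemo is hypothetical):

    // Each lane reads the value from the lane <delta> below it; lanes whose
    // source would fall off the bottom of the warp keep their own value.
    __global__ void ShiftUpDemo( int *out )
    {
        int lane = threadIdx.x & 31;
        out[threadIdx.x] = __shfl_up_sync( 0xffffffff, lane, 1 );  // full-warp mask
    }
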
chLib/chError.h | 34

@@ -112,6 +112,40 @@ chGetErrorString( CUresult status )
         ErrorValue(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
         ErrorValue(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED)
 #endif
+        ErrorValue(CUDA_ERROR_STUB_LIBRARY)
+        ErrorValue(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED)
+        ErrorValue(CUDA_ERROR_DEVICE_NOT_LICENSED)
+        ErrorValue(CUDA_ERROR_INVALID_PTX)
+        ErrorValue(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT)
+        ErrorValue(CUDA_ERROR_NVLINK_UNCORRECTABLE)
+        ErrorValue(CUDA_ERROR_JIT_COMPILER_NOT_FOUND)
+        ErrorValue(CUDA_ERROR_JIT_COMPILATION_DISABLED)
+        ErrorValue(CUDA_ERROR_UNSUPPORTED_PTX_VERSION)
+        ErrorValue(CUDA_ERROR_ILLEGAL_STATE)
+        ErrorValue(CUDA_ERROR_ILLEGAL_ADDRESS)
+        ErrorValue(CUDA_ERROR_HARDWARE_STACK_ERROR)
+        ErrorValue(CUDA_ERROR_ILLEGAL_INSTRUCTION)
+        ErrorValue(CUDA_ERROR_MISALIGNED_ADDRESS)
+        ErrorValue(CUDA_ERROR_INVALID_ADDRESS_SPACE)
+        ErrorValue(CUDA_ERROR_INVALID_PC)
+        ErrorValue(CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
+        ErrorValue(CUDA_ERROR_NOT_PERMITTED)
+        ErrorValue(CUDA_ERROR_NOT_SUPPORTED)
+        ErrorValue(CUDA_ERROR_SYSTEM_NOT_READY)
+        ErrorValue(CUDA_ERROR_SYSTEM_DRIVER_MISMATCH)
+        ErrorValue(CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_INVALIDATED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_MERGE)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNMATCHED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_UNJOINED)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_ISOLATION)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_IMPLICIT)
+        ErrorValue(CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD)
+        ErrorValue(CUDA_ERROR_TIMEOUT)
+        ErrorValue(CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE)
+        ErrorValue(CUDA_ERROR_CAPTURED_EVENT)
+        ErrorValue(CUDA_ERROR_UNKNOWN)
     }
     return "chGetErrorString - unknown error value";

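Note: chError.h maps driver API status codes to strings; the new entries track error codes added to CUresult through roughly CUDA 11. The ErrorValue macro is presumably a one-line case generator, along these lines (a sketch, not the file's verbatim definition):

    #define ErrorValue(Define)  case Define: return #Define;

    const char *chGetErrorString( CUresult status )
    {
        switch ( status ) {
            ErrorValue(CUDA_SUCCESS)
            // ... one ErrorValue per CUresult enumerant ...
        }
        return "chGetErrorString - unknown error value";
    }
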
concurrency/breakevenDtoHMemcpy.cu | 2

@@ -70,7 +70,7 @@ main( int argc, char *argv[] )
         cuda(MemcpyAsync( hostInt, deviceInt, byteCount,
             cudaMemcpyDeviceToHost, NULL ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

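Note: cudaThreadSynchronize() was deprecated in CUDA 4.0, when its name stopped matching the one-context-per-device model, and was eventually removed; cudaDeviceSynchronize() is the drop-in replacement, and this substitution accounts for most of the commit. The sync matters because these loops only enqueue asynchronous work. A self-contained sketch of the shared timing pattern (illustrative, not repo code):

    #include <cuda_runtime.h>
    #include <stdio.h>

    int main()
    {
        int *deviceInt = 0, hostInt = 0;
        if ( cudaSuccess != cudaMalloc( &deviceInt, sizeof(int) ) ) return 1;
        for ( int i = 0; i < 100000; i++ ) {
            cudaMemcpyAsync( &hostInt, deviceInt, sizeof(int),
                cudaMemcpyDeviceToHost, NULL );
        }
        // Without this, a wall-clock timer would measure enqueue time only.
        cudaError_t status = cudaDeviceSynchronize();
        printf( "%s\n", cudaGetErrorString( status ) );
        cudaFree( deviceInt );
        return cudaSuccess == status ? 0 : 1;
    }
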
concurrency/breakevenHtoDMemcpy.cu | 2

@@ -70,7 +70,7 @@ main( int argc, char *argv[] )
         cuda(MemcpyAsync( deviceInt, hostInt, byteCount,
             cudaMemcpyHostToDevice, NULL ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/breakevenKernelAsync.cu | 4

@@ -67,7 +67,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         WaitKernel<<<1,1>>>( 0, false );
     }
-    cudaThreadSynchronize();
+    cudaDeviceSynchronize();
     printf("Cycles\tus\n" );
     for ( int cycles = 0; cycles < 2500; cycles += 100 ) {
@@ -76,7 +76,7 @@ main( int argc, char *argv[] )
         for ( int i = 0; i < cIterations; i++ ) {
             WaitKernel<<<1,1>>>( cycles, false );
         }
-        cudaThreadSynchronize();
+        cudaDeviceSynchronize();
         chTimerGetTime( &stop );
         double microseconds = 1e6*chTimerElapsedTime( &start, &stop );
         double usPerLaunch = microseconds / (float) cIterations;

concurrency/eventRecord.cu | 2

@@ -72,7 +72,7 @@ usPerLaunch( int cIterations, int cEvents )
             cuda(EventRecord( events[j], NULL ) );
         }
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     microseconds = 1e6*chTimerElapsedTime( &start, &stop );

concurrency/managedOverhead.cu | 2

@@ -73,7 +73,7 @@ usPerLaunch( int cIterations, size_t cPages=0 )
     chTimerGetTime( &start );
     for ( int i = 0; i < cIterations; i++ ) {
         NullKernel<<<1,1>>>();
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
         if ( bTouch && 0 != p ) {
             for ( int iPage = 0; iPage < cPages; iPage++ ) {
                 ((volatile unsigned char *) p)[iPage*pageSize] |= 1;

concurrency/nullDtoHMemcpyAsync.cu | 2

@@ -60,7 +60,7 @@ main( int argc, char *argv[] )
         cuda(MemcpyAsync( hostInt, deviceInt, sizeof(int),
             cudaMemcpyDeviceToHost, NULL ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/nullDtoHMemcpySync.cu | 2

@@ -61,7 +61,7 @@ main( int argc, char *argv[] )
         cuda(Memcpy( hostInt, deviceInt, sizeof(int),
             cudaMemcpyDeviceToHost ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     cIterations *= 2;
 } while ( chTimerElapsedTime( &start, &stop ) < 0.5f ) ;

concurrency/nullHtoDMemcpyAsync.cu | 2

@@ -60,7 +60,7 @@ main( int argc, char *argv[] )
         cuda(MemcpyAsync( deviceInt, hostInt, sizeof(int),
             cudaMemcpyHostToDevice, NULL ) );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/nullKernelAsync.cu | 2

@@ -60,7 +60,7 @@ usPerLaunch( int cIterations )
     for ( int i = 0; i < cIterations; i++ ) {
         NullKernel<<<1,1>>>();
     }
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     chTimerGetTime( &stop );
     microseconds = 1e6*chTimerElapsedTime( &start, &stop );

concurrency/nullKernelSync.cu | 2

@@ -59,7 +59,7 @@ usPerLaunch( int cIterations )
     chTimerGetTime( &start );
     for ( int i = 0; i < cIterations; i++ ) {
         NullKernel<<<1,1>>>();
-        cuda(ThreadSynchronize() );
+        cuda(DeviceSynchronize() );
     }
     chTimerGetTime( &stop );

concurrency/pageableMemcpyHtoD.cu | 2

@@ -145,7 +145,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/pageableMemcpyHtoD16.cu | 2

@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/pageableMemcpyHtoD16Blocking.cu | 2

@@ -156,7 +156,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/pageableMemcpyHtoD16Broken.cu | 2

@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/pageableMemcpyHtoD16Synchronous.cu | 2

@@ -157,7 +157,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyHtoD( deviceInt, testVector, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

concurrency/peer2peerMemcpy.cu | 2

@@ -195,7 +195,7 @@ main( int argc, char *argv[] )
     for ( int i = 0; i < cIterations; i++ ) {
         chMemcpyPeerToPeer( deviceInt[0], 0, deviceInt[1], 1, numInts*sizeof(int) ) ;
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     {

corr/normalizedCrossCorrelation.cu | 3

@@ -476,7 +476,8 @@ main(int argc, char *argv[])
     int sharedPitch;
     int sharedMem;
-    char *inputFilename = "coins.pgm";
+    char defaultInputFilename[] = "coins.pgm";
+    char *inputFilename = defaultInputFilename;
     char *outputFilename = NULL;
     cudaArray *pArrayImage = NULL;

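Note: this fix (repeated in histogram/histogram.cu below) addresses a C++ language change, not a CUDA one: binding a string literal to a non-const char * was deprecated in C++03 and became ill-formed in C++11, because the literal's type is const char[]. Copying the literal into an array keeps the pointer assignable when command-line parsing later re-aims it. In miniature (a sketch):

    // char *bad = "coins.pgm";                 // ill-formed in C++11: literal is const
    char defaultInputFilename[] = "coins.pgm";  // writable copy of the literal
    char *inputFilename = defaultInputFilename; // OK; may later point at argv[i]
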
histogram/histogram.cu | 3

@@ -261,7 +261,8 @@ main(int argc, char *argv[])
     dim3 threads;
-    char *inputFilename = "coins.pgm";
+    char defaultInputFilename[] = "coins.pgm";
+    char *inputFilename = defaultInputFilename;
     char *outputFilename = NULL;
     cudaArray *pArrayImage = NULL;

memory/globalCopy.cu | 2

@@ -114,7 +114,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn,
         GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn+bOffsetSrc, N-bOffsetDst-bOffsetSrc );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

memory/globalCopy2.cu | 2

@@ -160,7 +160,7 @@ BandwidthCopy( T *deviceOut, T *deviceIn0, T *deviceIn1,
         GlobalCopy<T,n><<<cBlocks,cThreads>>>( deviceOut+bOffsetDst, deviceIn0+bOffsetSrc, deviceIn1+bOffsetSrc, N-bOffsetDst-bOffsetSrc );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

memory/globalRead.cu | 2

@@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
         GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

memory/globalReadTex.cu | 2

@@ -236,7 +236,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
         GlobalReads<T,n><<<cBlocks,cThreads>>>( out, bOffset, N-bOffset, false );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

memory/globalWrite.cu | 2

@@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads )
         GlobalWrites<T,n><<<cBlocks,cThreads>>>( out+bOffset, (T) 0xcc, N-bOffset );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

memory/spinlockReduction.cu | 2

@@ -208,7 +208,7 @@ AtomicsPerSecond( size_t N, int cBlocks, int cThreads )
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );

microbench/globalRead.cu | 2

@@ -179,7 +179,7 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
         GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

microbench/globalWrite.cu | 2

@@ -131,7 +131,7 @@ BandwidthWrites( size_t N, int cBlocks, int cThreads )
         GlobalWrites<T,n><<<cBlocks,cThreads>>>( out+bOffset, (T) 0xcc, N-bOffset );
     }
     cudaEventRecord( evStop );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     // make configurations that cannot launch error-out with 0 bandwidth
     cuda(GetLastError() );
     cuda(EventElapsedTime( &ms, evStart, evStop ) );

microbench/reportClocks.cu | 4

@@ -97,13 +97,13 @@ ReportTimesAndIDs( FILE *clocksFile, FILE *tidsFile, dim3 gridSize, dim3 blockSi
     cuda(EventCreate( &stop ) );
     WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     cuda(EventRecord( start, 0 ) );
     WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
     cuda(EventRecord( stop, 0 ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     {
         float ms;

nbody/nbody.cu | 6

@@ -7,8 +7,8 @@
  * parallelizable, with lots of FLOPS per unit of external
  * memory bandwidth required.
  *
- * Build with: nvcc -I ../chLib <options> nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu
- * On Linux: nvcc -I ../chLib <options> nbody.cu nbody_CPU_SSE.cpp nbody_CPU_SSE_threaded.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lpthread -lrt
+ * Build with: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu
+ * On Linux: nvcc -I ../chLib nbody.cu nbody_CPU_SSE.cpp nbody_CPU_AOS.cpp nbody_CPU_AOS_tiled.cpp nbody_CPU_SSE_threaded.cpp nbody_CPU_SOA.cpp nbody_GPU_shared.cu nbody_multiGPU.cu nbody_multiGPU_threaded.cu -lcudart_static -ldl -lrt
  * Requires: No minimum SM requirement. If SM 3.x is not available,
  * this application quietly replaces the shuffle and fast-atomic
  * implementations with the shared memory implementation.
@@ -482,7 +482,7 @@ ComputeGravitation(
         sumY += g_hostAOS_Force[i*3+1];
         sumZ += g_hostAOS_Force[i*3+2];
     }
-    *maxRelError = max( fabs(sumX), max(fabs(sumY), fabs(sumZ)) );
+    *maxRelError = std::max( fabs(sumX), std::max(fabs(sumY), fabs(sumZ)) );
     if ( g_ZeroThreshold != 0.0 &&
          fabs( *maxRelError ) > g_ZeroThreshold ) {
         printf( "Maximum sum of forces > threshold (%E > %E)\n",

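Note: the unqualified max() likely compiled only because older CUDA host headers injected min/max into the global namespace; modern toolchains do not, so the call is now qualified. Equivalent standalone form (a sketch, assuming <algorithm> and <cmath>):

    #include <algorithm>
    #include <cmath>

    double maxAbs3( double x, double y, double z )
    {
        return std::max( fabs(x), std::max( fabs(y), fabs(z) ) );
    }
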
nbody/nbody_GPU_AOS_tiled.cuh | 18

@@ -78,11 +78,11 @@ inline float
 __device__
 warpReduce( float x )
 {
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 8 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 4 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 2 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 1 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 8 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 4 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 2 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 1 ) );
     return x;
 }
@@ -111,10 +111,10 @@ DoNondiagonalTile_GPU(
     float fx, fy, fz;
     float4 bodyPosMass;
-    bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-    bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-    bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-    bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+    bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+    bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+    bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+    bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
     bodyBodyInteraction<float>(
         &fx, &fy, &fz,
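Note: the five xor-shuffles form a butterfly reduction: after exchanging with lanes 16, 8, 4, 2, and 1 away, every lane holds the full warp sum. The same computation as a loop, a sketch using the sync intrinsics (__shfl_xor_sync has a float overload, so the __float_as_int round-trip above is no longer strictly necessary):

    __device__ float warpSum( float x )
    {
        for ( int offset = 16; offset > 0; offset >>= 1 )
            x += __shfl_xor_sync( 0xffffffff, x, offset );
        return x;   // all 32 lanes return the identical sum
    }
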
nbody/nbody_GPU_AOS_tiled_const.cuh | 26

@@ -82,11 +82,11 @@ __device__
 warpReduce_const( float x )
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ > 300
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 16 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 8 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 4 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 2 ) );
-    x += __int_as_float( __shfl_xor( __float_as_int(x), 1 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 16 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 8 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 4 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 2 ) );
+    x += __int_as_float( __shfl_xor_sync( 0xffffffff, __float_as_int(x), 1 ) );
 #endif
     return x;
 }
@@ -117,10 +117,10 @@ DoNondiagonalTile_GPU_const(
     float fx, fy, fz;
     float4 bodyPosMass;
-    bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-    bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-    bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-    bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+    bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+    bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+    bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+    bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
     bodyBodyInteraction<float>(
         &fx, &fy, &fz,
@@ -200,10 +200,10 @@ DoNondiagonalTile_GPU_const(
     float fx, fy, fz;
     float4 bodyPosMass;
-    bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-    bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-    bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-    bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+    bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+    bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+    bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+    bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
     bodyBodyInteraction<float>(
         &fx, &fy, &fz,

nbody/nbody_GPU_SOA_tiled.cuh | 8

@@ -98,10 +98,10 @@ DoNondiagonalTile_GPU_SOA(
     float fx, fy, fz;
     float4 bodyPosMass;
-    bodyPosMass.x = __shfl( shufSrcPosMass.x, _j );
-    bodyPosMass.y = __shfl( shufSrcPosMass.y, _j );
-    bodyPosMass.z = __shfl( shufSrcPosMass.z, _j );
-    bodyPosMass.w = __shfl( shufSrcPosMass.w, _j );
+    bodyPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, _j );
+    bodyPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, _j );
+    bodyPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, _j );
+    bodyPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, _j );
     bodyBodyInteraction<float>(
         &fx, &fy, &fz,

nbody/nbody_GPU_Shuffle.cuh | 8

@@ -56,10 +56,10 @@ ComputeNBodyGravitation_Shuffle(
     float fx, fy, fz;
     float4 shufDstPosMass;
-    shufDstPosMass.x = __shfl( shufSrcPosMass.x, k );
-    shufDstPosMass.y = __shfl( shufSrcPosMass.y, k );
-    shufDstPosMass.z = __shfl( shufSrcPosMass.z, k );
-    shufDstPosMass.w = __shfl( shufSrcPosMass.w, k );
+    shufDstPosMass.x = __shfl_sync( 0xffffffff, shufSrcPosMass.x, k );
+    shufDstPosMass.y = __shfl_sync( 0xffffffff, shufSrcPosMass.y, k );
+    shufDstPosMass.z = __shfl_sync( 0xffffffff, shufSrcPosMass.z, k );
+    shufDstPosMass.w = __shfl_sync( 0xffffffff, shufSrcPosMass.w, k );
     bodyBodyInteraction(
         &fx, &fy, &fz,

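Note: the broadcast is done one component at a time because the shuffle intrinsics have no float4 overload. A hypothetical helper that packages the idiom (assumes all 32 lanes call it with the same srcLane):

    __device__ float4 shflFloat4( const float4& v, int srcLane )
    {
        float4 ret;
        ret.x = __shfl_sync( 0xffffffff, v.x, srcLane );
        ret.y = __shfl_sync( 0xffffffff, v.y, srcLane );
        ret.z = __shfl_sync( 0xffffffff, v.z, srcLane );
        ret.w = __shfl_sync( 0xffffffff, v.w, srcLane );
        return ret;
    }
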
reduction/reduction.cu | 4

@@ -85,7 +85,7 @@ TimedReduction(
     cuda(Malloc( &partialSums, cBlocks*sizeof(int) ) );
     cuda(EventCreate( &start ) );
     cuda(EventCreate( &stop ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     cuda(EventRecord( start, 0 ) );
     hostReduction(
@@ -165,7 +165,7 @@ usPerInvocation( int cIterations, size_t N,
     for ( int i = 0; i < cIterations; i++ ) {
         pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     ret = chTimerElapsedTime( &start, &stop );
     ret = (ret / (double) cIterations) * 1e6;

reduction/reductionTemplated.cu | 4

@@ -129,7 +129,7 @@ TimedReduction(
     cuda(Malloc( &partialSums, cBlocks*sizeof(ReductionType) ) );
     cuda(EventCreate( &start ) );
     cuda(EventCreate( &stop ) );
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     cuda(EventRecord( start, 0 ) );
     hostReduction( deviceAnswer, partialSums, deviceIn, N, cBlocks, cThreads );
@@ -209,7 +209,7 @@ usPerInvocation( int cIterations, size_t N,
     for ( int i = 0; i < cIterations; i++ ) {
         pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
     }
-    cuda(ThreadSynchronize() );
+    cuda(DeviceSynchronize() );
     chTimerGetTime( &stop );
     ret = chTimerElapsedTime( &start, &stop );
     ret = (ret / (double) cIterations) * 1e6;

reduction/reductionWarpShuffle.cuh | 10

@@ -61,11 +61,11 @@ Reduction2_kernel( int *out, const int *in, size_t N )
     volatile int *wsSum = sPartials;
     if ( blockDim.x > 32 ) wsSum[tid] += wsSum[tid + 32];
     int mySum = wsSum[tid];
-    mySum += __shfl_xor( mySum, 16 );
-    mySum += __shfl_xor( mySum, 8 );
-    mySum += __shfl_xor( mySum, 4 );
-    mySum += __shfl_xor( mySum, 2 );
-    mySum += __shfl_xor( mySum, 1 );
+    mySum += __shfl_xor_sync( 0xffffffff, mySum, 16 );
+    mySum += __shfl_xor_sync( 0xffffffff, mySum, 8 );
+    mySum += __shfl_xor_sync( 0xffffffff, mySum, 4 );
+    mySum += __shfl_xor_sync( 0xffffffff, mySum, 2 );
+    mySum += __shfl_xor_sync( 0xffffffff, mySum, 1 );
     /* wsSum[tid] += wsSum[tid + 16];
        wsSum[tid] += wsSum[tid + 8];
        wsSum[tid] += wsSum[tid + 4];

scan/int/timeScan.cu | 2

@@ -96,7 +96,7 @@ TimeScan( void (*pfnScanGPU)(T *, const T *, size_t, int),
     for ( int i = 0; i < cIterations; i++ ) {
         pfnScanGPU( outGPU, inGPU, N, numThreads );
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         goto Error;
     chTimerGetTime( &stop );

scan/streamCompact/timeStreamCompact_odd.cu | 2

@@ -122,7 +122,7 @@ TimeStreamCompact(
     for ( int i = 0; i < cIterations; i++ ) {
         pfnScanGPU( outGPU, deviceTotal, inGPU, N, numThreads );
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         goto Error;
     chTimerGetTime( &stop );

scan/warp/scanWarpShuffle.cuh | 2

@@ -69,7 +69,7 @@ exclusive_scan_warp_shfl(int mysum)
 {
     const unsigned int lane = threadIdx.x & 31;
     for(int i = 0; i < levels; ++i)
-        mysum = shfl_scan_add_step(mysum, 1 << i);
+        mysum = scanWarpShuffle_step( mysum, 1 << i);
     mysum = __shfl_up(mysum, 1);
     return (lane) ? mysum : 0;
 }

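Note: this hunk only renames the helper; the trailing __shfl_up that converts the inclusive scan to exclusive is left in legacy form here. A sync-era sketch of what scanWarpShuffle_step presumably does (the body is an assumption, not the repo's definition):

    __device__ int scanWarpShuffle_step( int partial, int offset )
    {
        int n = __shfl_up_sync( 0xffffffff, partial, offset );
        // Lanes below 'offset' have no source lane and keep their value.
        if ( (threadIdx.x & 31) >= offset ) partial += n;
        return partial;
    }
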
scan/warp/testScanWarp.cu | 2

@@ -63,7 +63,7 @@ enum ScanType {
 #include "scanReduceThenScan.cuh"
 #include "scanReduceThenScan_0.cuh"
 #include "scan2Level.cuh"
-#include "ScanThrust.cuh"
+#include "scanThrust.cuh"
 template<int period>
 void

texturing/surf2Dmemset.cu | 2

@@ -184,7 +184,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);

texturing/tex1d_9bit.cu | 2

@@ -98,7 +98,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
     tex.addressMode[0] = addressMode;
     cuda(HostGetDevicePointer(&outDevice, outHost, 0));
     TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     printf( "X\tY\tActual Value\tExpected Value\tDiff\n" );
     for ( int i = 0; i < outN; i++ ) {

texturing/tex1d_addressing.cu | 2

@@ -169,7 +169,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
     tex.filterMode = filterMode;
     tex.addressMode[0] = addressMode;
     TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < outN; i++ ) {
         float x = base+(float)i*increment;

texturing/tex1d_unnormalized.cu | 2

@@ -112,7 +112,7 @@ CreateAndPrintTex( T *initTex, size_t texN, size_t outN,
     tex.addressMode[0] = addressMode;
     cuda(HostGetDevicePointer(&outDevice, outHost, 0));
     TexReadout<<<2,384>>>( outDevice, outN, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < outN; i++ ) {
         printf( "(%.2f, %.2f)\n", outHost[i].x, outHost[i].y );

texturing/tex1dfetch.cu | 2

@@ -65,7 +65,7 @@ PrintTex( float *host, size_t N )
     cuda(HostGetDevicePointer( (void **) &device, host, 0 ));
     TexReadout<<<2,384>>>( device, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         printf( "%.2f ", host[i] );
     }

texturing/tex1dfetch_big.cu | 2

@@ -177,7 +177,7 @@ TexChecksum( int *out, int c, size_t N )
         default:
             goto Error;
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         goto Error;
     *out = checksumGPU();
     ret = true;

texturing/tex1dfetch_host.cu | 2

@@ -65,7 +65,7 @@ PrintTex( float *host, size_t N )
     cuda(HostGetDevicePointer( (void **) &device, host, 0 ));
     TexReadout<<<2,384>>>( device, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         printf( "%.2f ", host[i] );
     }

texturing/tex1dfetch_htod.cu | 2

@@ -68,7 +68,7 @@ MeasureBandwidth( void *out, size_t N, int blocks, int threads )
     chTimerGetTime( &start );
     TexReadout<<<2,384>>>( (float *) out, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     chTimerGetTime( &stop );

texturing/tex1dfetch_int2float.cu | 2

@@ -119,7 +119,7 @@ CheckTexPromoteToFloat( size_t N )
         cudaCreateChannelDesc<T>(),
         N*sizeof(T)));
     TexReadout<<<2,384>>>( foutDevice, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         printf( "%.2f ", foutHost[i] );

texturing/tex1dfetch_offset.cu | 2

@@ -66,7 +66,7 @@ CheckTex( float *hostOut, const float *in, size_t offset, size_t N )
     cuda(HostGetDevicePointer( (void **) &deviceOut, hostOut, 0 ));
     TexReadout<<<2,384>>>( deviceOut, offset>>2, N );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int i = 0; i < N; i++ ) {
         if ( in[i] != hostOut[i] ) {
             printf( "Mismatch at index %d\n", i );

texturing/tex2d_addressing.cu | 2

@@ -139,7 +139,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);

texturing/tex2d_addressing_device.cu | 2

@@ -141,7 +141,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);

texturing/tex2d_memset.cu | 2

@@ -157,7 +157,7 @@ CreateAndPrintTex(
     blocks.y = 1;
     threads.x = 64; threads.y = 4;
     TexReadout<<<blocks,threads>>>( outDevice, outWidth, outPitch, outHeight, base, increment );
-    cuda(ThreadSynchronize());
+    cuda(DeviceSynchronize());
     for ( int row = 0; row < outHeight; row++ ) {
         float4 *outrow = (float4 *) ((char *) outHost + row*outPitch);

texturing/tex2d_opengl.cu | 2

@@ -189,7 +189,7 @@ void displayCB(void) /* function called whenever redisplay needed */
     else {
         RenderTextureUnnormalized<<<g_height, 384>>>( g_deviceFrameBuffer, g_width, g_height );
     }
-    if ( cudaSuccess != cudaThreadSynchronize() )
+    if ( cudaSuccess != cudaDeviceSynchronize() )
         return;
     glRasterPos2f( 0.0f, 0.0f );
     glDrawPixels( g_width, g_height, GL_RGBA, GL_UNSIGNED_BYTE, g_hostFrameBuffer );
