You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
287 lines
9.2 KiB
287 lines
9.2 KiB
/* |
|
* |
|
* globalRead.cu |
|
* |
|
* Microbenchmark for read bandwidth from global memory. |
|
* |
|
* Build with: nvcc -I ../chLib <options> globalRead.cu |
|
* Requires: No minimum SM requirement. |
|
* |
|
* Copyright (c) 2011-2012, Archaea Software, LLC. |
|
* All rights reserved. |
|
|
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions |
|
* are met: |
|
|
|
* 1. Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* 2. Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in |
|
* the documentation and/or other materials provided with the |
|
* distribution. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
|
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
* POSSIBILITY OF SUCH DAMAGE. |
|
* |
|
*/ |
|
|
|
#include <stdio.h> |
|
|
|
#include <chError.h> |
|
#include <chCommandLine.h> |
|
|
|
template<class T> |
|
__device__ __host__ T |
|
plus( const T& a, const T& b ) |
|
{ |
|
T ret = a; |
|
ret += b; |
|
return ret; |
|
} |
|
|
|
struct myInt2 { |
|
int2 i2; |
|
|
|
__host__ __device__ myInt2() { } |
|
__host__ __device__ myInt2( int i ) { i2.x = i2.y = i; } |
|
}; |
|
|
|
template<> |
|
__device__ __host__ myInt2 |
|
plus( const myInt2& a, const myInt2& b ) |
|
{ |
|
myInt2 ret; |
|
ret.i2.x = a.i2.x + b.i2.x; |
|
ret.i2.y = a.i2.y + b.i2.y; |
|
return ret; |
|
} |
|
|
|
struct myInt4 { |
|
int4 i4; |
|
|
|
__host__ __device__ myInt4() { } |
|
__host__ __device__ myInt4( int i ) { i4.x = i4.y = i4.z = i4.w = i; } |
|
}; |
|
|
|
template<> |
|
__device__ __host__ myInt4 |
|
plus( const myInt4& a, const myInt4& b ) |
|
{ |
|
myInt4 ret; |
|
ret.i4.x = a.i4.x + b.i4.x; |
|
ret.i4.y = a.i4.y + b.i4.y; |
|
ret.i4.z = a.i4.z + b.i4.z; |
|
ret.i4.w = a.i4.w + b.i4.w; |
|
return ret; |
|
} |
|
|
|
template<class T, const int n> |
|
__global__ void |
|
GlobalReads( T *out, const T *in, size_t N, bool bWriteResults ) |
|
{ |
|
T sums[n]; |
|
size_t i; |
|
for ( int j = 0; j < n; j++ ) { |
|
sums[j] = T(0); |
|
} |
|
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; |
|
i < N-n*blockDim.x*gridDim.x; |
|
i += n*blockDim.x*gridDim.x ) { |
|
for ( int j = 0; j < n; j++ ) { |
|
size_t index = i+j*blockDim.x; |
|
sums[j] = plus( sums[j], in[index] ); |
|
} |
|
} |
|
// to avoid the (index<N) conditional in the inner loop, |
|
// we left off some work at the end |
|
for ( int j = 0; j < n; j++ ) { |
|
size_t index = i+j*blockDim.x; |
|
if ( index<N ) sums[j] = plus( sums[j], in[index] ); |
|
} |
|
|
|
if ( bWriteResults ) { |
|
T sum = T(0); |
|
for ( int j = 0; j < n; j++ ) { |
|
sum = plus( sum, sums[j] ); |
|
} |
|
out[blockIdx.x*blockDim.x+threadIdx.x] = sum; |
|
} |
|
} |
|
|
|
template<class T, const int n, bool bOffset> |
|
double |
|
BandwidthReads( size_t N, int cBlocks, int cThreads ) |
|
{ |
|
T *in = 0; |
|
T *out = 0; |
|
T *hostIn = 0; |
|
T *hostOut = 0; |
|
double ret = 0.0; |
|
double elapsedTime; |
|
float ms; |
|
int cIterations; |
|
cudaError_t status; |
|
T sumCPU; |
|
cudaEvent_t evStart = 0; |
|
cudaEvent_t evStop = 0; |
|
|
|
cuda(Malloc( &in, N*sizeof(T) ) ); |
|
cuda(Malloc( &out, cBlocks*cThreads*sizeof(T) ) ); |
|
|
|
hostIn = new T[N]; |
|
if ( ! hostIn ) |
|
goto Error; |
|
hostOut = new T[cBlocks*cThreads]; |
|
if ( ! hostOut ) |
|
goto Error; |
|
|
|
sumCPU = T(0); |
|
// populate input array with random numbers |
|
for ( size_t i = bOffset; i < N; i++ ) { |
|
T nextrand = T(rand()); |
|
sumCPU = plus( sumCPU, nextrand ); |
|
hostIn[i] = nextrand; |
|
} |
|
|
|
cuda(Memcpy( in, hostIn, N*sizeof(T), cudaMemcpyHostToDevice ) ); |
|
cuda(EventCreate( &evStart ) ); |
|
cuda(EventCreate( &evStop ) ); |
|
|
|
{ |
|
// confirm that kernel launch with this configuration writes correct result |
|
GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, true ); |
|
cuda(Memcpy( hostOut, out, cBlocks*cThreads*sizeof(T), cudaMemcpyDeviceToHost ) ); |
|
cuda(GetLastError() ); |
|
T sumGPU = T(0); |
|
for ( size_t i = 0; i < cBlocks*cThreads; i++ ) { |
|
sumGPU = plus( sumGPU, hostOut[i] ); |
|
} |
|
if ( memcmp( &sumCPU, &sumGPU, sizeof(T) ) ) { |
|
printf( "Incorrect sum computed!\n" ); |
|
goto Error; |
|
} |
|
} |
|
|
|
cIterations = 10; |
|
cudaEventRecord( evStart ); |
|
for ( int i = 0; i < cIterations; i++ ) { |
|
GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false ); |
|
} |
|
cudaEventRecord( evStop ); |
|
cuda(DeviceSynchronize() ); |
|
// make configurations that cannot launch error-out with 0 bandwidth |
|
cuda(GetLastError() ); |
|
cuda(EventElapsedTime( &ms, evStart, evStop ) ); |
|
elapsedTime = ms/1000.0f; |
|
|
|
// bytes per second |
|
ret = ((double)N*cIterations*sizeof(T)) / elapsedTime; |
|
// gigabytes per second |
|
ret /= 1024.0*1048576.0; |
|
|
|
Error: |
|
if ( hostIn ) delete[] hostIn; |
|
if ( hostOut ) delete[] hostOut; |
|
cudaEventDestroy( evStart ); |
|
cudaEventDestroy( evStop ); |
|
cudaFree( in ); |
|
cudaFree( out ); |
|
return ret; |
|
} |
|
|
|
template<class T, const int n, bool bOffset> |
|
double |
|
ReportRow( size_t N, size_t threadStart, size_t threadStop, size_t cBlocks ) |
|
{ |
|
int maxThreads = 0; |
|
double maxBW = 0.0; |
|
printf( "%d\t", n ); |
|
for ( int cThreads = threadStart; cThreads <= threadStop; cThreads *= 2 ) { |
|
double bw = BandwidthReads<T,n,bOffset>( N, cBlocks, cThreads ); |
|
if ( bw > maxBW ) { |
|
maxBW = bw; |
|
maxThreads = cThreads; |
|
} |
|
printf( "%.2f\t", bw ); |
|
} |
|
printf( "%.2f\t%d\n", maxBW, maxThreads ); |
|
return maxBW; |
|
} |
|
|
|
template<class T, bool bCoalesced> |
|
void |
|
Shmoo( size_t N, size_t threadStart, size_t threadStop, size_t cBlocks ) |
|
{ |
|
printf( "Operand size: %d byte%c\n", (int) sizeof(T), sizeof(T)==1 ? '\0' : 's' ); |
|
printf( "Input size: %dM operands\n", (int) (N>>20) ); |
|
printf( "Unroll\t" ); |
|
for ( int cThreads = threadStart; cThreads <= threadStop; cThreads *= 2 ) { |
|
printf( "%d\t", cThreads ); |
|
} |
|
printf( "maxBW\tmaxThreads\n" ); |
|
ReportRow<T, 1, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 2, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 3, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 4, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 5, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 6, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 7, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 8, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T, 9, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,10, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,11, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,12, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,13, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,14, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,15, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
ReportRow<T,16, ! bCoalesced>( N, threadStart, threadStop, cBlocks ); |
|
} |
|
|
|
|
|
int |
|
main( int argc, char *argv[] ) |
|
{ |
|
cudaError_t status; |
|
int device = 0; |
|
int size = 16; |
|
cudaDeviceProp prop; |
|
if ( chCommandLineGet( &device, "device", argc, argv ) ) { |
|
printf( "Using device %d...\n", device ); |
|
} |
|
cuda(SetDevice(device) ); |
|
cuda(GetDeviceProperties( &prop, device ) ); |
|
printf( "Running globalRead.cu microbenchmark on %s\n", prop.name ); |
|
if ( chCommandLineGet( &size, "size", argc, argv ) ) { |
|
printf( "Using %dM operands ...\n", size ); |
|
} |
|
|
|
if ( chCommandLineGetBool( "uncoalesced", argc, argv ) ) { |
|
printf( "Using uncoalesced memory transactions\n" ); |
|
Shmoo<char,false>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<short,false>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<int,false>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<myInt2,false>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<myInt4,false>( (size_t) size*1048576, 32, 512, 1500 ); |
|
} else { |
|
printf( "Using coalesced memory transactions\n" ); |
|
Shmoo<char,true>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<short,true>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<int,true>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<myInt2,true>( (size_t) size*1048576, 32, 512, 1500 ); |
|
Shmoo<myInt4,true>( (size_t) size*1048576, 32, 512, 1500 ); |
|
} |
|
return 0; |
|
Error: |
|
return 1; |
|
}
|
|
|