Source code that accompanies The CUDA Handbook.

/*
 *
 * globalRead.cu
 *
 * Microbenchmark for read bandwidth from global memory.
 *
 * Build with: nvcc -I ../chLib <options> globalRead.cu
 * Requires: No minimum SM requirement.
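 *
 * Example invocations (assuming chCommandLine's "--option value"
 * syntax; device and size default to 0 and 16M, respectively):
 *     globalRead --device 1           run on the second CUDA device
 *     globalRead --size 32            use 32M operands
 *     globalRead --uncoalesced        offset all reads by one element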
 *
 * Copyright (c) 2011-2012, Archaea Software, LLC.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <stdio.h>
#include <chError.h>
#include <chCommandLine.h>
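
//
// plus() is a generic addition helper callable from both host and device
// code. The built-in vector types int2 and int4 do not define operator+,
// so the wrapper structs myInt2 and myInt4 below get explicit
// specializations instead.
//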
template<class T>
__device__ __host__ T
plus( const T& a, const T& b )
{
    T ret = a;
    ret += b;
    return ret;
}

struct myInt2 {
    int2 i2;
    __host__ __device__ myInt2() { }
    __host__ __device__ myInt2( int i ) { i2.x = i2.y = i; }
};

template<>
__device__ __host__ myInt2
plus( const myInt2& a, const myInt2& b )
{
    myInt2 ret;
    ret.i2.x = a.i2.x + b.i2.x;
    ret.i2.y = a.i2.y + b.i2.y;
    return ret;
}

struct myInt4 {
    int4 i4;
    __host__ __device__ myInt4() { }
    __host__ __device__ myInt4( int i ) { i4.x = i4.y = i4.z = i4.w = i; }
};

template<>
__device__ __host__ myInt4
plus( const myInt4& a, const myInt4& b )
{
    myInt4 ret;
    ret.i4.x = a.i4.x + b.i4.x;
    ret.i4.y = a.i4.y + b.i4.y;
    ret.i4.z = a.i4.z + b.i4.z;
    ret.i4.w = a.i4.w + b.i4.w;
    return ret;
}
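
//
// GlobalReads: each thread strides through the input accumulating n
// partial sums, where the template parameter n is the unroll factor
// (more outstanding loads in flight per thread). For a given j, threads
// of a warp read adjacent elements, so the loads coalesce when the base
// pointer is aligned. Because bWriteResults is a runtime parameter, the
// compiler cannot discard the loads even on the timed runs that skip
// writing the sums.
//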
template<class T, const int n>
__global__ void
GlobalReads( T *out, const T *in, size_t N, bool bWriteResults )
{
    T sums[n];
    size_t i;
    for ( int j = 0; j < n; j++ ) {
        sums[j] = T(0);
    }
    for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
          i < N-n*blockDim.x*gridDim.x;
          i += n*blockDim.x*gridDim.x ) {
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            sums[j] = plus( sums[j], in[index] );
        }
    }
    // to avoid an (index < N) conditional in the inner loop above,
    // the main loop stops short of the end; pick up the tail here
    for ( int j = 0; j < n; j++ ) {
        size_t index = i+j*blockDim.x;
        if ( index < N ) sums[j] = plus( sums[j], in[index] );
    }
    if ( bWriteResults ) {
        T sum = T(0);
        for ( int j = 0; j < n; j++ ) {
            sum = plus( sum, sums[j] );
        }
        out[blockIdx.x*blockDim.x+threadIdx.x] = sum;
    }
}
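
//
// BandwidthReads: host-side harness for one (unroll factor, launch
// configuration) data point. It validates a kernel launch against a
// CPU-computed sum, then times cIterations launches with CUDA events
// and returns the read bandwidth in GiB/s, or 0.0 if the configuration
// fails to launch or validate. When bOffset is true, the input pointer
// is offset by one element to force uncoalesced transactions.
//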
template<class T, const int n, bool bOffset>
double
BandwidthReads( size_t N, int cBlocks, int cThreads )
{
    T *in = 0;
    T *out = 0;
    T *hostIn = 0;
    T *hostOut = 0;
    double ret = 0.0;
    double elapsedTime;
    float ms;
    int cIterations;
    cudaError_t status;
    T sumCPU;
    cudaEvent_t evStart = 0;
    cudaEvent_t evStop = 0;

    cuda(Malloc( &in, N*sizeof(T) ) );
    cuda(Malloc( &out, cBlocks*cThreads*sizeof(T) ) );
    hostIn = new T[N];
    if ( ! hostIn )
        goto Error;
    hostOut = new T[cBlocks*cThreads];
    if ( ! hostOut )
        goto Error;

    // populate the input array with random numbers, summing on the CPU
    // as we go so the GPU result can be checked
    sumCPU = T(0);
    for ( size_t i = bOffset; i < N; i++ ) {
        T nextrand = T(rand());
        sumCPU = plus( sumCPU, nextrand );
        hostIn[i] = nextrand;
    }
    cuda(Memcpy( in, hostIn, N*sizeof(T), cudaMemcpyHostToDevice ) );
    cuda(EventCreate( &evStart ) );
    cuda(EventCreate( &evStop ) );

    {
        // confirm that a kernel launch with this configuration
        // computes the correct result
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, true );
        cuda(Memcpy( hostOut, out, cBlocks*cThreads*sizeof(T), cudaMemcpyDeviceToHost ) );
        cuda(GetLastError() );
        T sumGPU = T(0);
        for ( size_t i = 0; i < cBlocks*cThreads; i++ ) {
            sumGPU = plus( sumGPU, hostOut[i] );
        }
        if ( memcmp( &sumCPU, &sumGPU, sizeof(T) ) ) {
            printf( "Incorrect sum computed!\n" );
            goto Error;
        }
    }

    cIterations = 10;
    cudaEventRecord( evStart );
    for ( int i = 0; i < cIterations; i++ ) {
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
    }
    cudaEventRecord( evStop );
    cuda(DeviceSynchronize() );
    // configurations that cannot launch report 0 bandwidth
    cuda(GetLastError() );
    cuda(EventElapsedTime( &ms, evStart, evStop ) );
    elapsedTime = ms/1000.0f;
    // bytes per second...
    ret = ((double)N*cIterations*sizeof(T)) / elapsedTime;
    // ...converted to GiB/s
    ret /= 1024.0*1048576.0;
Error:
    if ( hostIn ) delete[] hostIn;
    if ( hostOut ) delete[] hostOut;
    cudaEventDestroy( evStart );
    cudaEventDestroy( evStop );
    cudaFree( in );
    cudaFree( out );
    return ret;
}
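
//
// ReportRow: emits one row of the shmoo table for unroll factor n,
// reporting bandwidth at each thread count from threadStart to
// threadStop, followed by the best bandwidth observed and the thread
// count that achieved it.
//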
template<class T, const int n, bool bOffset>
double
ReportRow( size_t N, size_t threadStart, size_t threadStop, size_t cBlocks )
{
    int maxThreads = 0;
    double maxBW = 0.0;
    printf( "%d\t", n );
    for ( int cThreads = threadStart; cThreads <= threadStop; cThreads *= 2 ) {
        double bw = BandwidthReads<T,n,bOffset>( N, cBlocks, cThreads );
        if ( bw > maxBW ) {
            maxBW = bw;
            maxThreads = cThreads;
        }
        printf( "%.2f\t", bw );
    }
    printf( "%.2f\t%d\n", maxBW, maxThreads );
    return maxBW;
}
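
//
// Shmoo: sweeps unroll factors 1 through 16 for operand type T, printing
// one ReportRow per factor. When bCoalesced is false, ReportRow is
// instantiated with bOffset=true, so every read is offset by one element
// and the transactions are uncoalesced.
//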
template<class T, bool bCoalesced>
void
Shmoo( size_t N, size_t threadStart, size_t threadStop, size_t cBlocks )
{
    printf( "Operand size: %d byte%c\n", (int) sizeof(T), sizeof(T)==1 ? '\0' : 's' );
    printf( "Input size: %dM operands\n", (int) (N>>20) );
    printf( "Unroll\t" );
    for ( int cThreads = threadStart; cThreads <= threadStop; cThreads *= 2 ) {
        printf( "%d\t", cThreads );
    }
    printf( "maxBW\tmaxThreads\n" );
    ReportRow<T, 1, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 2, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 3, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 4, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 5, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 6, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 7, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 8, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T, 9, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,10, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,11, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,12, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,13, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,14, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,15, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
    ReportRow<T,16, ! bCoalesced>( N, threadStart, threadStop, cBlocks );
}
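
//
// main: parses the command line (see the header comment for example
// invocations) and runs the shmoo over operand sizes of 1, 2, 4, 8, and
// 16 bytes, in either coalesced or uncoalesced mode.
//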
int
main( int argc, char *argv[] )
{
    cudaError_t status;
    int device = 0;
    int size = 16;
    cudaDeviceProp prop;
    if ( chCommandLineGet( &device, "device", argc, argv ) ) {
        printf( "Using device %d...\n", device );
    }
    cuda(SetDevice(device) );
    cuda(GetDeviceProperties( &prop, device ) );
    printf( "Running globalRead.cu microbenchmark on %s\n", prop.name );
    if ( chCommandLineGet( &size, "size", argc, argv ) ) {
        printf( "Using %dM operands...\n", size );
    }
    if ( chCommandLineGetBool( "uncoalesced", argc, argv ) ) {
        printf( "Using uncoalesced memory transactions\n" );
        Shmoo<char,false>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<short,false>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<int,false>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<myInt2,false>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<myInt4,false>( (size_t) size*1048576, 32, 512, 1500 );
    }
    else {
        printf( "Using coalesced memory transactions\n" );
        Shmoo<char,true>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<short,true>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<int,true>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<myInt2,true>( (size_t) size*1048576, 32, 512, 1500 );
        Shmoo<myInt4,true>( (size_t) size*1048576, 32, 512, 1500 );
    }
    return 0;
Error:
    return 1;
}