Source code that accompanies The CUDA Handbook.
/*
*
* reductionTemplated.cu
*
* Microbenchmark/demo that uses templates to efficiently compute
* reductions for any data type.
*
* Build with: nvcc -I ../chLib --gpu-architecture sm_11 <options> reductionTemplated.cu
* Requires: SM 1.1, for global atomics.
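*
* Example invocations (assuming the binary is named reductionTemplated;
* see Usage() below for the full list of options):
*     reductionTemplated --size 16
*     reductionTemplated --float --squares --shmooFile shmoo.txt
*     reductionTemplated --overhead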
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <assert.h>
#include <chCommandLine.h>
#include <chTimer.h>
#include <chError.h>
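
//
// forkPrint() writes the same formatted output to stdout and, if out is
// non-NULL, also to out. It is used to mirror shmoo output to a file.
//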
void
forkPrint( FILE *out, const char *fmt, ... )
{
    va_list arg;
    va_start( arg, fmt );
    vprintf( fmt, arg );
    if ( out ) {
        // Restart the argument list; a va_list that has been consumed by
        // vprintf may not be reused without another va_start.
        va_end( arg );
        va_start( arg, fmt );
        vfprintf( out, fmt, arg );
    }
    va_end( arg );
}
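
// Optional file that mirrors shmoo output (opened in main() when --shmooFile
// is given; NULL otherwise).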
FILE *g_fileShmoo;
//
// From Reduction SDK sample:
//
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
//
template<class T>
struct SharedMemory
{
    __device__ inline operator T*()
    {
        extern __shared__ int __smem[];
        return (T*) (void *) __smem;
    }

    __device__ inline operator const T*() const
    {
        extern __shared__ int __smem[];
        return (T*) (void *) __smem;
    }
};
#include "reduction_Sumi.h"
#include "reduction_Sumi_isq.h"
#include "reduction_Sumf.h"
#include "reduction_Sumf_fsq.h"
#include "reduction1Templated.cuh"
#include "reduction2Templated.cuh"
#include "reduction3Templated.cuh"
#include "reduction4Templated.cuh"
typedef struct TimingResult_struct {
    double us;
    int fastestThreads;
} TimingResult;
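
//
// chEventBandwidth() returns the bandwidth in bytes per second implied by
// the given event pair and byte count (0.0 if the elapsed time could not
// be queried).
//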
double
chEventBandwidth( cudaEvent_t start, cudaEvent_t stop, double cBytes )
{
    float ms;
    if ( cudaSuccess != cudaEventElapsedTime( &ms, start, stop ) )
        return 0.0;
    return cBytes * 1000.0f / ms;
}
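
//
// TimedReduction (below) times one invocation of a host reduction wrapper
// with CUDA events and returns the elapsed time in microseconds. The wrapper
// receives device pointers for the final result and the per-block partial sums.
//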
typedef void (*pfnReduction)(CReduction_Sumi *out, CReduction_Sumi *partialSums, const int *in, size_t N, int cBlocks, int cThreads);

template<typename ReductionType, typename T>
double
TimedReduction(
    ReductionType *answer, const T *deviceIn, size_t N, int cBlocks, int cThreads,
    void (*hostReduction)(ReductionType *out, ReductionType *partialSums, const T *in, size_t N, int cBlocks, int cThreads)
)
{
    double ret = 0.0;
    ReductionType *deviceAnswer = 0;
    ReductionType *partialSums = 0;
    float ms;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    cudaError_t status;

    cuda(Malloc( &deviceAnswer, sizeof(ReductionType) ) );
    cuda(Malloc( &partialSums, cBlocks*sizeof(ReductionType) ) );
    cuda(EventCreate( &start ) );
    cuda(EventCreate( &stop ) );
    cuda(DeviceSynchronize() );

    cuda(EventRecord( start, 0 ) );
    hostReduction( deviceAnswer, partialSums, deviceIn, N, cBlocks, cThreads );
    cuda(EventRecord( stop, 0 ) );
    // Copy the final reduction result (one ReductionType) back to the host.
    cuda(Memcpy( answer, deviceAnswer, sizeof(ReductionType), cudaMemcpyDeviceToHost ) );
    cuda(EventElapsedTime( &ms, start, stop ) );
    ret = ms * 1000.0f;

    // fall through to free resources before returning
Error:
    cudaFree( deviceAnswer );
    cudaFree( partialSums );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    return ret;
}
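
//
// Shmoo sweep parameters: the block count and the range of thread counts
// (powers of two) to try. Overridable with --blocks, --threadMin and
// --threadMax.
//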
int g_cBlocks = 900;
int g_ShmooThreadMin = 32;
int g_ShmooThreadMax = 512;
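
//
// Shmoo() sweeps the thread count from g_ShmooThreadMin to g_ShmooThreadMax,
// timing pfnReduce at each step and validating its result against expectedSum;
// the fastest configuration is reported in *timingResult.
//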
template<typename ReductionType, typename T>
void
Shmoo( TimingResult *timingResult,
    T *deviceData, size_t cInts, const ReductionType& expectedSum,
    void (*pfnReduce)(ReductionType *out, ReductionType *partialSums,
        const T *in, size_t N, int cBlocks, int cThreads) )
{
    double minTime = 1e6;
    int fastestThreads = 0;

    for ( int cThreads = g_ShmooThreadMin;
          cThreads <= g_ShmooThreadMax;
          cThreads *= 2 ) {
        printf( "%d\t", cThreads );
    }
    printf( "minTC\tminTime\n" );
    for ( int cThreads = g_ShmooThreadMin;
          cThreads <= g_ShmooThreadMax;
          cThreads *= 2 ) {
        ReductionType sum;
        double us = TimedReduction( &sum, deviceData, cInts, g_cBlocks, cThreads, pfnReduce );
        if ( sum != expectedSum ) {
            printf( "Sum is wrong: 0x%08x should be 0x%08x\n", sum.sum, expectedSum.sum );
            exit(1);
        }
        if ( us < minTime ) {
            minTime = us;
            fastestThreads = cThreads;
        }
        printf( "%.2f\t", us );
        if ( cThreads == g_ShmooThreadMax ) {
            printf( "%d\t%.2f\n", fastestThreads, minTime );
        }
    }
    timingResult->fastestThreads = fastestThreads;
    timingResult->us = minTime;
}
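
//
// usPerInvocation() measures launch overhead: it runs the given reduction
// cIterations times on a small N-element input (one block, 256 threads) and
// returns the average wall-clock time per invocation in microseconds.
//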
template<typename ReductionType, typename T>
double
usPerInvocation( int cIterations, size_t N,
    void (*pfnReduction)( ReductionType *out, ReductionType *partial,
        const T *in, size_t N, int numBlocks, int numThreads ) )
{
    cudaError_t status;
    T *smallArray = 0;
    ReductionType *partialSums = 0;
    double ret = 0.0f;
    chTimerTimestamp start, stop;

    cuda(Malloc( &smallArray, N*sizeof(T) ) );
    cuda(Malloc( &partialSums, 1*sizeof(ReductionType) ) );
    chTimerGetTime( &start );
    for ( int i = 0; i < cIterations; i++ ) {
        pfnReduction( partialSums, partialSums, smallArray, N, 1, 256 );
    }
    cuda(DeviceSynchronize() );
    chTimerGetTime( &stop );
    ret = chTimerElapsedTime( &start, &stop );
    ret = (ret / (double) cIterations) * 1e6;
Error:
    (void) cudaFree( partialSums );
    (void) cudaFree( smallArray );
    return ret;
}
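
//
// ReportOverhead() prints the per-invocation overhead of each of the four
// reduction implementations (used with --overhead).
//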
template<typename ReductionType, typename T>
void
ReportOverhead()
{
    printf( "Microseconds per reduction operation:\n" );
    const size_t N = 1;
    printf( "\tReduction1: %.2f\n", usPerInvocation<ReductionType, T>( 100000, N, Reduction1 ) );
    printf( "\tReduction2: %.2f\n", usPerInvocation<ReductionType, T>( 100000, N, Reduction2 ) );
    printf( "\tReduction3: %.2f\n", usPerInvocation<ReductionType, T>( 100000, N, Reduction3 ) );
    printf( "\tReduction4: %.2f\n", usPerInvocation<ReductionType, T>( 100000, N, Reduction4 ) );
}
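
//
// ShmooReport() allocates N elements of type T, fills them with test data,
// computes the expected sum on the CPU, then runs the shmoo over all four
// reduction implementations and reports each one's fastest thread count
// and time.
//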
template<typename ReductionType, typename T>
bool
ShmooReport( size_t N, bool bFloat )
{
    cudaError_t status;
    cudaDeviceProp props;
    bool ret = false;
    T *hostData = 0;
    T *deviceData = 0;

    forkPrint( g_fileShmoo, "Shmoo: %d %ss\n", (int) N,
        (bFloat) ? "float" : "int" );

    hostData = (T *) malloc( N*sizeof(T) );
    if ( ! hostData )
        goto Error;
    // Fill the host array with small values so the reference sum computed
    // below matches the sum computed on the device.
    for ( size_t i = 0; i < N; i++ ) {
        hostData[i] = (T) (rand() & 0xf);
    }
    cuda(SetDeviceFlags( cudaDeviceMapHost ) );
    cuda(Malloc( &deviceData, N*sizeof(T) ) );
    cuda(GetDeviceProperties( &props, 0 ) );
    cuda(Memcpy( deviceData, hostData, N*sizeof(T), cudaMemcpyHostToDevice ) );

    forkPrint( g_fileShmoo, "N\tThreads(1)\tus(1)\t"
        "Threads(2)\tus(2)\t"
        "Threads(3)\tus(3)\t"
        "Threads(4)\tus(4)\n" );
    {
        TimingResult result[4];
        void (*rgpfnReduction[4])(ReductionType *out,
            ReductionType *partialSums,
            const T *in, size_t N, int cBlocks, int cThreads) =
            { Reduction1<ReductionType, T>,
              Reduction2<ReductionType, T>,
              Reduction3<ReductionType, T>,
              Reduction4<ReductionType, T> };
        ReductionType sum;

        forkPrint( g_fileShmoo, "%d\t", (int) N );
        for ( size_t i = 0; i < N; i++ ) {
            sum += hostData[i];
        }
        for ( size_t i = 0; i < 4; i++ ) {
            Shmoo( &result[i], deviceData, N, sum, rgpfnReduction[i] );
            forkPrint( g_fileShmoo, "%d\t%.2f\t",
                result[i].fastestThreads, result[i].us );
            fflush(stdout);
        }
        forkPrint( g_fileShmoo, "\n" );
    }
    ret = true;
Error:
    free( hostData );
    cudaFree( deviceData );
    return ret;
}
void
Usage()
{
    printf( "Usage: reductionTemplated [options]\n" );
    printf( "    Options may include any of the following:\n" );
    printf( "    --device <device> - specify device to run on\n" );
    printf( "    --float - perform analysis on float-valued reductions\n" );
    printf( "    --squares - perform analysis on sum+sum of squares reductions (i.e. more computation)\n" );
    printf( "    --overhead - instead of performing shmoo over range of input sizes, report kernel launch overhead\n" );
    printf( "    --size <N> - number of Melements (scaled by 1048576)\n" );
    printf( "    --blocks <N> - number of blocks to run analysis on\n" );
    printf( "    --threadMax <count> - maximum number of threads to run analysis on\n" );
    printf( "    --threadMin <count> - minimum number of threads to run analysis on\n" );
    printf( "    --shmooFile <filename> - name of file to which to dump shmoo reporting\n" );
}
int
main( int argc, char *argv[] )
{
    int device = 0;
    int iN = 16;
    size_t N;
    bool bDoFloats = false;
    bool bDoSquares = false;
    bool bDoOverhead = false;

    if ( argc == 1 ) {
        Usage();
        exit(0);
    }
    if ( chCommandLineGet( &device, "device", argc, argv ) ) {
        printf( "Using device %d...\n", device );
        cudaSetDevice( device );
    }
    chCommandLineGet( &iN, "size", argc, argv );
    chCommandLineGet( &g_cBlocks, "blocks", argc, argv );
    chCommandLineGet( &g_ShmooThreadMin, "threadMin", argc, argv );
    chCommandLineGet( &g_ShmooThreadMax, "threadMax", argc, argv );
    if ( g_ShmooThreadMin < 32 || g_ShmooThreadMax < 32 ||
         g_ShmooThreadMax > 512 || g_ShmooThreadMin > 512 ) {
        fprintf( stderr, "threadMin/threadMax must be in range 32..512\n" );
        exit(1);
    }
    if ( g_ShmooThreadMin > g_ShmooThreadMax ) {
        fprintf( stderr, "threadMax must be greater than or equal to threadMin\n" );
        exit(1);
    }
    if ( (g_ShmooThreadMin & (g_ShmooThreadMin-1)) ||
         (g_ShmooThreadMax & (g_ShmooThreadMax-1)) ) {
        fprintf( stderr, "threadMin and threadMax must be powers of 2\n" );
        exit(1);
    }
    N = (size_t) iN * 1048576;
    bDoFloats = chCommandLineGetBool( "float", argc, argv );
    bDoSquares = chCommandLineGetBool( "squares", argc, argv );
    bDoOverhead = chCommandLineGetBool( "overhead", argc, argv );

    printf( "Threads: %d - %d\n", g_ShmooThreadMin, g_ShmooThreadMax );
    printf( "Blocks: %d\n", g_cBlocks );

    {
        char *fname;
        if ( chCommandLineGet( &fname, "shmooFile", argc, argv ) ) {
            g_fileShmoo = fopen( fname, "wt" );
        }
    }

    if ( bDoOverhead ) {
        if ( bDoSquares ) {
            ReportOverhead<CReduction_Sumi_isq, int>();
        }
        else {
            ReportOverhead<CReduction_Sumi, int>();
        }
        exit(0);
    }

    if ( bDoFloats ) {
        if ( bDoSquares ) {
            ShmooReport<CReduction_Sumf_fsq, float>( N, true );
        }
        else {
            ShmooReport<CReduction_Sumf, float>( N, true );
        }
    }
    else {
        if ( bDoSquares ) {
            ShmooReport<CReduction_Sumi_isq, int>( N, false );
        }
        else {
            ShmooReport<CReduction_Sumi, int>( N, false );
        }
    }
    if ( g_fileShmoo ) {
        fclose( g_fileShmoo );
    }
    return 0;
}