Source code that accompanies The CUDA Handbook.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

125 lines
3.8 KiB

/*
*
* divergence.cu
*
* Microdemo to measure performance implications of conditional code.
*
* Build with: nvcc [--gpu-architecture sm_xx] divergence.cu
* Requires: No minimum SM requirement.
*
* Copyright (c) 2021, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <unistd.h>
#include <chError.h>
//
// parameters p and n are expected to account for grid structure
// apply threadIdx and/or blockIdx to p before passing in;
// apply blockDim and/or gridDim to n before passing in.
//
template<int base>
void __device__ sumInts( uint32_t *p, size_t N, size_t n )
{
for ( size_t i = 0; i < N; i++ ) {
*p += base;
p += n;
}
}
typedef void(*psumInts)(uint32_t *, size_t, size_t);
__device__ psumInts rgSumInts[] = {
sumInts< 0>, sumInts< 1>, sumInts< 2>, sumInts< 3>,
sumInts< 4>, sumInts< 5>, sumInts< 6>, sumInts< 7>,
sumInts< 8>, sumInts< 9>, sumInts<10>, sumInts<11>,
sumInts<12>, sumInts<13>, sumInts<14>, sumInts<15>,
sumInts<16>, sumInts<17>, sumInts<18>, sumInts<19>,
sumInts<20>, sumInts<21>, sumInts<22>, sumInts<23>,
sumInts<24>, sumInts<25>, sumInts<26>, sumInts<27>,
sumInts<28>, sumInts<29>, sumInts<30>, sumInts<31> };
template<uint32_t sh>
__global__ void
sumInts_bythread( uint32_t *p, size_t N )
{
uint32_t warpish_id = threadIdx.x>>sh;
N /= blockDim.x*gridDim.x;
rgSumInts[warpish_id&31]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
}
template<uint32_t sh>
static double
timeByThreads( uint32_t *p, size_t N )
{
cudaError_t status;
float elapsed_time;
double ret = 0.0;
cudaEvent_t start = 0, stop = 0;
cuda(EventCreate( &start ));
cuda(EventCreate( &stop ));
cuda(EventRecord( start ));
sumInts_bythread<sh><<<3072,1024>>>( p, N );
cuda(EventRecord( stop ));
cuda(DeviceSynchronize());
cuda(EventElapsedTime( &elapsed_time, start, stop ));
ret = N*1000.0/elapsed_time/1e9;
printf( "%2d threads: %f Gops/s\n", 1<<sh, ret );
Error:
cudaEventDestroy( stop );
cudaEventDestroy( start );
return ret;
}
int
main()
{
cudaError_t status;
size_t N = 1024*1024*1024UL;
uint32_t *p = 0;
cuda(Malloc( (void **) &p, N*sizeof(uint32_t)) );
cuda(Memset( p, 0, N*sizeof(uint32_t)) );
timeByThreads<6>( p, N );
timeByThreads<5>( p, N );
timeByThreads<4>( p, N );
timeByThreads<3>( p, N );
timeByThreads<2>( p, N );
timeByThreads<1>( p, N );
timeByThreads<0>( p, N );
cudaFree( p );
return 0;
Error:
return 1;
}