Checkpoint divergence performance test

4 years ago · d71cd98c31
1 changed files with 112 additions and 0 deletions
--- a/SMs/divergence.cu
+++ b/SMs/divergence.cu
@ -0,0 +1,112 @@
 /*
 *
 * divergence.cu
 *
 * Microdemo to measure performance implications of conditional code.
 *
 * Build with: nvcc [--gpu-architecture sm_xx] [-D USE_FLOAT] [-D USE_IF_STATEMENT] divergence.cu
 * Requires: No minimum SM requirement.
 *
 * Copyright (c) 2021, Archaea Software, LLC.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright 
 *    notice, this list of conditions and the following disclaimer. 
 * 2. Redistributions in binary form must reproduce the above copyright 
 *    notice, this list of conditions and the following disclaimer in 
 *    the documentation and/or other materials provided with the 
 *    distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
 #include <stdio.h>
 #include <unistd.h>
 #include <chError.h>
 //
 // parameters p and n are expected to account for grid structure
 // apply threadIdx and/or blockIdx to p before passing in;
 // apply blockDim and/or gridDim to n before passing in.
 //
 template<int base>
 void __device__ sumFloats( float *p, size_t N, size_t n )
 {
    float f = base;
    for ( size_t i = 0; i < N; i++ ) {
        *p += f;
        p += n;
    }
 }
 typedef void(*psumFloats)(float *, size_t, size_t);
 __device__ psumFloats rgSumFloats[] = {
    sumFloats< 0>, sumFloats< 1>, sumFloats< 2>, sumFloats< 3>,
    sumFloats< 4>, sumFloats< 5>, sumFloats< 6>, sumFloats< 7>,
    sumFloats< 8>, sumFloats< 9>, sumFloats<10>, sumFloats<11>,
    sumFloats<12>, sumFloats<13>, sumFloats<14>, sumFloats<15>,
    sumFloats<16>, sumFloats<17>, sumFloats<18>, sumFloats<19>,
    sumFloats<20>, sumFloats<21>, sumFloats<22>, sumFloats<23>,
    sumFloats<24>, sumFloats<25>, sumFloats<26>, sumFloats<27>,
    sumFloats<28>, sumFloats<29>, sumFloats<30>, sumFloats<31> };
 __global__ void
 sumFloats_bywarp( float *p, size_t N )
 {
    uint32_t warpid = threadIdx.x>>5;
    N /= blockDim.x*gridDim.x;
    rgSumFloats[warpid]( p+threadIdx.x+blockIdx.x*blockDim.x, N, blockDim.x*gridDim.x );
 }
 __global__ void
 sumFloats_bythread( float *p, size_t N )
 {
 }
 int
 main()
 {
    cudaError_t status;
    size_t N = 1024*1024*1024UL;
    float *p = 0;
    float et;
    cudaEvent_t start = 0, stop = 0;
    cuda(Malloc( (void **) &p, N*sizeof(float)) );
    cuda(Memset( p, 0, N*sizeof(float)) );
    cuda(EventCreate( &start ));
    cuda(EventCreate( &stop ));
    cuda(EventRecord( start ));
    sumFloats_bywarp<<<3072,256>>>( p, N );
    cuda(EventRecord( stop ));
    cuda(DeviceSynchronize());
    cuda(EventElapsedTime( &et, start, stop ));
    printf( "%.2f ms = %.2f Gops/s\n", et, (double) N*1000.0/et/1e9 );
    cudaFree( p );
    cudaEventDestroy( stop );
    cudaEventDestroy( start );
    return 0;
 Error:
    return 1;
 }