Source code that accompanies The CUDA Handbook.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

180 lines
6.0 KiB

/*
*
* reportClocks.cu
*
* Reports how grid and block IDs are assigned by the hardware.
*
* Build with: nvcc -I ../chLib <options> reportClocks.cu
* Requires: No minimum SM requirement.
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <chError.h>
__global__ void
WriteClockValues( unsigned int *completionTimes, unsigned int *threadIDs )
{
size_t globalBlock = blockIdx.x+blockDim.x*(blockIdx.y+blockDim.y*blockIdx.z);
size_t globalThread = threadIdx.x+blockDim.x*(threadIdx.y+blockDim.y*threadIdx.z);
size_t totalBlockSize = blockDim.x*blockDim.y*blockDim.z;
size_t globalIndex = globalBlock*totalBlockSize + globalThread;
completionTimes[globalIndex] = clock();
threadIDs[globalIndex] = threadIdx.y<<4|threadIdx.x;
}
void
WriteOutput( FILE *out, dim3 gridSize, dim3 blockSize, const unsigned int *hostOut )
{
int index = 0;
for ( int gridZ = 0; gridZ < gridSize.z; gridZ++ ) {
for ( int gridY = 0; gridY < gridSize.y; gridY++ ) {
for ( int gridX = 0; gridX < gridSize.x; gridX++ ) {
for ( int threadZ = 0; threadZ < blockSize.z; threadZ++ ) {
printf( "Grid (%d, %d, %d) - slice %d:\n", gridX, gridY, gridZ, threadZ );
for ( int threadY = 0; threadY < blockSize.y; threadY++ ) {
for ( int threadX = 0; threadX < blockSize.x; threadX++ ) {
fprintf( out, "%4x", hostOut[index] );
index++;
}
fprintf(out, "\n" );
}
}
}
}
}
}
bool
ReportTimesAndIDs( FILE *clocksFile, FILE *tidsFile, dim3 gridSize, dim3 blockSize )
{
cudaError_t status;
bool ret = false;
size_t totalBlockSize = blockSize.x*blockSize.y*blockSize.z;
size_t numTimes = totalBlockSize*gridSize.x*gridSize.y*gridSize.z;
cudaEvent_t start = 0;
cudaEvent_t stop = 0;
unsigned int *deviceClockValues = 0;
unsigned int *deviceThreadIDs = 0;
unsigned int *hostOut = 0;
cuda(Malloc( &deviceClockValues, numTimes*sizeof(int) ) );
cuda(Malloc( &deviceThreadIDs, numTimes*sizeof(int) ) );
cuda(MallocHost( &hostOut, numTimes*sizeof(int) ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
cuda(DeviceSynchronize() );
cuda(EventRecord( start, 0 ) );
WriteClockValues<<<gridSize, blockSize>>>( deviceClockValues, deviceThreadIDs );
cuda(EventRecord( stop, 0 ) );
cuda(DeviceSynchronize() );
{
float ms;
cuda(EventElapsedTime( &ms, start, stop ) );
printf( "%.2f ms for %d threads = %.2f us/thread\n", ms, (int) numTimes, ms*1e3/numTimes );
}
if ( clocksFile ) {
cuda(Memcpy( hostOut, deviceClockValues, numTimes*sizeof(int), cudaMemcpyDeviceToHost ) );
// turn clock values into completion times by subtracting minimum reported clock value
unsigned int minTime = ~(unsigned int) 0;
for ( size_t i = 0; i < numTimes; i++ ) {
if ( hostOut[i] < minTime ) {
minTime = hostOut[i];
}
}
for ( size_t i = 0; i < numTimes; i++ ) {
hostOut[i] -= minTime;
}
fprintf( clocksFile, "Completion times (clocks):\n" );
WriteOutput( clocksFile, gridSize, blockSize, hostOut );
}
if ( tidsFile ) {
cuda(Memcpy( hostOut, deviceThreadIDs, numTimes*sizeof(int), cudaMemcpyDeviceToHost ) );
fprintf( tidsFile, "Thread IDs:\n" );
WriteOutput( tidsFile, gridSize, blockSize, hostOut );
}
ret = true;
Error:
cudaFree( deviceClockValues );
cudaFree( deviceThreadIDs );
cudaFreeHost( hostOut );
cudaEventDestroy( start );
cudaEventDestroy( stop );
return ret;
}
int
main( int argc, char *argv[] )
{
dim3 blockSize;
dim3 gridSize;
blockSize.x = blockSize.y = blockSize.z = 1;
gridSize.x = gridSize.y = gridSize.z = 1;
gridSize.x = 2;
blockSize.x = 16;
blockSize.y = 8;
ReportTimesAndIDs( stdout, stdout, gridSize, blockSize );
blockSize.x = 14;
blockSize.y = 8;
ReportTimesAndIDs( stdout, stdout, gridSize, blockSize );
#if 0
for ( gridSize.x = 1; gridSize.x <= 8; gridSize.x++ ) {
blockSize.x = blockSize.y = 16;
ReportTimesAndIDs( stdout, NULL, gridSize, blockSize );
blockSize.x = 17;
blockSize.y = 16;
ReportTimesAndIDs( stdout, NULL, gridSize, blockSize );
blockSize.x = 18;
blockSize.y = 16;
ReportTimesAndIDs( stdout, NULL, gridSize, blockSize );
}
#endif
}