Initial cut, streaming samples

Nicholas Wilt, 13 years ago · commit 012ee91830 · branch pull/1/head
4 changed files, 724 additions:

  streaming/stream1Async.cu       +192
  streaming/stream2Streams.cu     +206
  streaming/stream3Mapped.cu      +162
  streaming/stream4MappedXfer.cu  +164

streaming/stream1Async.cu (+192 lines)

@@ -0,0 +1,192 @@
/*
*
* stream1Async.cu
*
* Microbenchmark to illustrate a bandwidth-limited workload.
*
* It separately measures the host->device transfer time, kernel
* processing time, and device->host transfer time. Due to low
* arithmetic density in the saxpy() kernel, the bulk of time
* is spent transferring data.
*
* Build with: nvcc -I ../chLib stream1Async.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
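//
// CUDART_CHECK() comes from chError.h (in ../chLib, not part of this
// commit). Based on how it is used below -- it assigns the return value
// to the local 'status' and jumps to the Error: label on failure -- a
// minimal sketch of the assumed macro would be:
//
//     #define CUDART_CHECK( call ) \
//         do { \
//             status = (call); \
//             if ( cudaSuccess != status ) { \
//                 goto Error; \
//             } \
//         } while ( 0 )
//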
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
float *msHtoD,
float *msKernel,
float *msDtoH,
size_t N,
float alpha,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evHtoD = 0;
cudaEvent_t evKernel = 0;
cudaEvent_t evDtoH = 0;
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), 0 ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrY, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrY, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrX, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrX, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evHtoD ) );
CUDART_CHECK( cudaEventCreate( &evKernel ) );
CUDART_CHECK( cudaEventCreate( &evDtoH ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
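//
// Events are recorded into the NULL stream around each phase: evStart
// before the host->device copies, evHtoD after them, evKernel after the
// kernel launch, and evDtoH after the device->host copy. The elapsed
// time between consecutive events gives the per-phase timings reported
// by main().
//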
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
CUDART_CHECK( cudaMemcpyAsync( dptrX, hptrX, N*sizeof(float), cudaMemcpyHostToDevice, NULL ) );
CUDART_CHECK( cudaMemcpyAsync( dptrY, hptrY, N*sizeof(float), cudaMemcpyHostToDevice, NULL ) );
CUDART_CHECK( cudaEventRecord( evHtoD, 0 ) );
saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
CUDART_CHECK( cudaEventRecord( evKernel, 0 ) );
CUDART_CHECK( cudaMemcpyAsync( hptrOut, dptrOut, N*sizeof(float), cudaMemcpyDeviceToHost, NULL ) );
CUDART_CHECK( cudaEventRecord( evDtoH, 0 ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msHtoD, evStart, evHtoD ) );
CUDART_CHECK( cudaEventElapsedTime( msKernel, evHtoD, evKernel ) );
CUDART_CHECK( cudaEventElapsedTime( msDtoH, evKernel, evDtoH ) );
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evDtoH ) );
Error:
cudaEventDestroy( evDtoH );
cudaEventDestroy( evKernel );
cudaEventDestroy( evHtoD );
cudaEventDestroy( evStart );
cudaFree( dptrOut );
cudaFree( dptrX );
cudaFree( dptrY );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
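//
// Returns bandwidth in MB/s (1 MB = 1e6 bytes), given the byte count
// and the elapsed time in milliseconds.
//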
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal, msHtoD, msKernel, msDtoH;
CUDART_CHECK( MeasureTimes( &msTotal, &msHtoD, &msKernel, &msDtoH, N, alpha, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
printf( "Memcpy( host->device ): %.2f ms (%.2f MB/s)\n", msHtoD, Bandwidth( msHtoD, 2*N*sizeof(float) ) );
printf( "Kernel processing : %.2f ms (%.2f MB/s)\n", msKernel, Bandwidth( msKernel, 3*N*sizeof(float) ) );
printf( "Memcpy (device->host ): %.2f ms (%.2f MB/s)\n", msDtoH, Bandwidth( msDtoH, N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}

streaming/stream2Streams.cu (+206 lines)

@@ -0,0 +1,206 @@
/*
*
* stream2Streams.cu
*
* Formulation of stream1Async.cu that uses streams to overlap data
* transfers and kernel processing.
*
* Build with: nvcc -I ../chLib stream2Streams.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
size_t N,
float alpha,
int nStreams,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaStream_t *streams = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evStop = 0;
size_t streamStep = N / nStreams;
if ( N % nStreams ) {
printf( "Stream count must be evenly divisible into N\n" );
status = cudaErrorInvalidValue;
goto Error;
}
streams = new cudaStream_t[nStreams];
if ( ! streams ) {
status = cudaErrorMemoryAllocation;
goto Error;
}
memset( streams, 0, nStreams*sizeof(cudaStream_t) );
for ( int i = 0; i < nStreams; i++ ) {
CUDART_CHECK( cudaStreamCreate( &streams[i] ) );
}
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), 0 ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrY, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrY, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrX, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrX, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
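//
// Issue the work breadth-first: all host->device copies, then all
// kernel launches, then all device->host copies, each chunk in its own
// stream. This ordering lets copies in one stream overlap with kernel
// execution in another.
//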
for ( int iStream = 0; iStream < nStreams; iStream++ ) {
CUDART_CHECK( cudaMemcpyAsync( dptrX+iStream*streamStep, hptrX+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyHostToDevice, streams[iStream] ) );
CUDART_CHECK( cudaMemcpyAsync( dptrY+iStream*streamStep, hptrY+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyHostToDevice, streams[iStream] ) );
}
for ( int iStream = 0; iStream < nStreams; iStream++ ) {
saxpy<<<nBlocks, nThreads, 0, streams[iStream]>>>( dptrOut+iStream*streamStep, dptrX+iStream*streamStep, dptrY+iStream*streamStep, streamStep, alpha );
}
for ( int iStream = 0; iStream < nStreams; iStream++ ) {
CUDART_CHECK( cudaMemcpyAsync( hptrOut+iStream*streamStep, dptrOut+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyDeviceToHost, streams[iStream] ) );
}
CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
Error:
if ( streams ) {
for ( int i = 0; i < nStreams; i++ ) {
cudaStreamDestroy( streams[i] );
}
delete[] streams;
}
cudaEventDestroy( evStart );
cudaEventDestroy( evStop );
cudaFree( dptrOut );
cudaFree( dptrX );
cudaFree( dptrY );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nStreams = 8;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &nThreads, "nStreams", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal;
CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nStreams, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}

streaming/stream3Mapped.cu (+162 lines)

@@ -0,0 +1,162 @@
/*
*
* stream3Mapped.cu
*
* Formulation of stream1Async.cu that uses mapped pinned memory to
* hold the input and output data. Since the kernel can use mapped
* pinned memory to initiate DMA transfers across the bus, this
* version is simpler (no cudaMemcpy() calls) and just as fast.
*
* Build with: nvcc -I ../chLib stream3Mapped.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
size_t N,
float alpha,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evStop = 0;
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrOut, hptrOut, 0 ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrY, hptrY, 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrX, hptrX, 0 ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
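//
// The kernel reads x and y and writes out directly through the mapped
// host pointers, so the PCI Express transfers happen as part of kernel
// execution and no explicit cudaMemcpy() is needed. The
// cudaDeviceSynchronize() below ensures the kernel's writes are visible
// to the CPU before validation.
//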
saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
Error:
cudaEventDestroy( evStop );
cudaEventDestroy( evStart );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal;
CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}

streaming/stream4MappedXfer.cu (+164 lines)

@@ -0,0 +1,164 @@
/*
*
* stream4MappedXfer.cu
*
* Formulation of stream1Async.cu that uses mapped pinned memory to
* perform a computation on inputs in host memory while writing the
* output to device memory.
*
* Build with: nvcc -I ../chLib stream4MappedXfer.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
size_t N,
float alpha,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evStop = 0;
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), cudaHostAllocMapped ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrY, hptrY, 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrX, hptrX, 0 ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
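//
// The inputs x and y are read through mapped host pointers, while the
// output is written to device memory; the cudaMemcpy() after the timed
// region copies the result back to the host for validation.
//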
saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
CUDART_CHECK( cudaMemcpy( hptrOut, dptrOut, N*sizeof(float), cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
Error:
cudaEventDestroy( evStop );
cudaEventDestroy( evStart );
cudaFree( dptrOut );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal;
CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}