diff --git a/streaming/stream1Async.cu b/streaming/stream1Async.cu new file mode 100644 index 0000000..3537e3f --- /dev/null +++ b/streaming/stream1Async.cu @@ -0,0 +1,192 @@ +/* + * + * stream1Async.cu + * + * Microbenchmark to illustrate a bandwidth-limited workload. + * + * It separately measures the host->device transfer time, kernel + * processing time, and device->host transfer time. Due to low + * arithmetic density in the saxpy() kernel, the bulk of time + * is spent transferring data. + * + * Build with: nvcc -I ../chLib stream1Async.cu + * + * Copyright (c) 2012, Archaea Software, LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <chError.h>
+#include <chCommandLine.h>
+
+//
+// saxpy global function adds in[i]*alpha to each element out[i]
+//
+// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
+//
+
+__global__ void
+saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    // Grid-stride loop: correct for any launch configuration and any N.
+    for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
+          i < N;
+          i += blockDim.x*gridDim.x ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+// CPU reference implementation of the same computation.
+void
+saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    for ( size_t i = 0; i < N; i++ ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+//
+// Times the H->D copies, kernel, and D->H copy separately with CUDA
+// events.  The ms* out-parameters are only valid when cudaSuccess is
+// returned.
+//
+cudaError_t
+MeasureTimes(
+    float *msTotal,
+    float *msHtoD,
+    float *msKernel,
+    float *msDtoH,
+    size_t N,
+    float alpha,
+    int nBlocks,
+    int nThreads )
+{
+    cudaError_t status;
+    float *dptrOut = 0, *hptrOut = 0;
+    float *dptrY = 0, *hptrY = 0;
+    float *dptrX = 0, *hptrX = 0;
+    cudaEvent_t evStart = 0;
+    cudaEvent_t evHtoD = 0;
+    cudaEvent_t evKernel = 0;
+    cudaEvent_t evDtoH = 0;
+
+    CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), 0 ) );
+    memset( hptrOut, 0, N*sizeof(float) );
+    CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), 0 ) );
+    CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), 0 ) );
+
+    CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
+    CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
+
+    CUDART_CHECK( cudaMalloc( &dptrY, N*sizeof(float) ) );
+    CUDART_CHECK( cudaMemset( dptrY, 0, N*sizeof(float) ) );
+
+    CUDART_CHECK( cudaMalloc( &dptrX, N*sizeof(float) ) );
+    // BUGFIX: was a duplicated cudaMemset( dptrY, ... ), leaving dptrX
+    // uninitialized.
+    CUDART_CHECK( cudaMemset( dptrX, 0, N*sizeof(float) ) );
+
+    CUDART_CHECK( cudaEventCreate( &evStart ) );
+    CUDART_CHECK( cudaEventCreate( &evHtoD ) );
+    CUDART_CHECK( cudaEventCreate( &evKernel ) );
+    CUDART_CHECK( cudaEventCreate( &evDtoH ) );
+    for ( size_t i = 0; i < N; i++ ) {
+        hptrX[i] = (float) rand() / RAND_MAX;
+        hptrY[i] = (float) rand() / RAND_MAX;
+    }
+    CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
+    CUDART_CHECK( cudaMemcpyAsync( dptrX, hptrX,
N*sizeof(float), cudaMemcpyHostToDevice, NULL ) );
+    CUDART_CHECK( cudaMemcpyAsync( dptrY, hptrY, N*sizeof(float), cudaMemcpyHostToDevice, NULL ) );
+    CUDART_CHECK( cudaEventRecord( evHtoD, 0 ) );
+    // BUGFIX: execution configuration was garbled to "<<>>"; restore it.
+    saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
+    CUDART_CHECK( cudaEventRecord( evKernel, 0 ) );
+    CUDART_CHECK( cudaMemcpyAsync( hptrOut, dptrOut, N*sizeof(float), cudaMemcpyDeviceToHost, NULL ) );
+    CUDART_CHECK( cudaEventRecord( evDtoH, 0 ) );
+    CUDART_CHECK( cudaDeviceSynchronize() );
+    // Validate the GPU result against the expected saxpy value.
+    for ( size_t i = 0; i < N; i++ ) {
+        if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
+            status = cudaErrorUnknown;
+            goto Error;
+        }
+    }
+    CUDART_CHECK( cudaEventElapsedTime( msHtoD, evStart, evHtoD ) );
+    CUDART_CHECK( cudaEventElapsedTime( msKernel, evHtoD, evKernel ) );
+    CUDART_CHECK( cudaEventElapsedTime( msDtoH, evKernel, evDtoH ) );
+    CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evDtoH ) );
+Error:
+    cudaEventDestroy( evDtoH );
+    cudaEventDestroy( evKernel );
+    cudaEventDestroy( evHtoD );
+    cudaEventDestroy( evStart );
+    cudaFree( dptrOut );
+    cudaFree( dptrX );
+    cudaFree( dptrY );
+    // BUGFIX: hptrOut was leaked.
+    cudaFreeHost( hptrOut );
+    cudaFreeHost( hptrX );
+    cudaFreeHost( hptrY );
+    return status;
+}
+
+// Converts a time in ms and a byte count into bytes per second.
+double
+Bandwidth( float ms, double NumBytes )
+{
+    return NumBytes / (1000.0*ms);
+}
+
+int
+main( int argc, char *argv[] )
+{
+    cudaError_t status;
+    int N_Mfloats = 128;
+    size_t N;
+    int nBlocks = 1500;
+    int nThreads = 256;
+    float alpha = 2.0f;
+
+    chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
+    chCommandLineGet( &nThreads, "nThreads", argc, argv );
+    chCommandLineGet( &N_Mfloats, "N", argc, argv );
+    printf( "Measuring times with %dM floats", N_Mfloats );
+    if ( N_Mfloats==128 ) {
+        printf( " (use --N to specify number of Mfloats)");
+    }
+    printf( "\n" );
+
+    // BUGFIX: compute in size_t to avoid 32-bit int overflow for large --N.
+    N = 1048576*(size_t) N_Mfloats;
+
+    CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
+    {
+        float msTotal, msHtoD, msKernel, msDtoH;
+        CUDART_CHECK( MeasureTimes( &msTotal, &msHtoD, &msKernel, &msDtoH, N, alpha, nBlocks, nThreads ) );
+
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) ); + printf( "Memcpy( host->device ): %.2f ms (%.2f MB/s)\n", msHtoD, Bandwidth( msHtoD, 2*N*sizeof(float) ) ); + printf( "Kernel processing : %.2f ms (%.2f MB/s)\n", msKernel, Bandwidth( msKernel, 3*N*sizeof(float) ) ); + printf( "Memcpy (device->host ): %.2f ms (%.2f MB/s)\n", msDtoH, Bandwidth( msDtoH, N*sizeof(float) ) ); + } + +Error: + if ( status == cudaErrorMemoryAllocation ) { + printf( "Memory allocation failed\n" ); + } + else if ( cudaSuccess != status ) { + printf( "Failed\n" ); + } + return cudaSuccess != status; +} diff --git a/streaming/stream2Streams.cu b/streaming/stream2Streams.cu new file mode 100644 index 0000000..cd771f9 --- /dev/null +++ b/streaming/stream2Streams.cu @@ -0,0 +1,206 @@ +/* + * + * stream2Streams.cu + * + * Formulation of stream1Async.cu that uses streams to overlap data + * transfers and kernel processing. + * + * Build with: nvcc -I ../chLib stream2Streams.cu + * + * Copyright (c) 2012, Archaea Software, LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <chError.h>
+#include <chCommandLine.h>
+
+//
+// saxpy global function adds in[i]*alpha to each element out[i]
+//
+// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
+//
+
+__global__ void
+saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    // Grid-stride loop: correct for any launch configuration and any N.
+    for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
+          i < N;
+          i += blockDim.x*gridDim.x ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+// CPU reference implementation of the same computation.
+void
+saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    for ( size_t i = 0; i < N; i++ ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+//
+// Times the full transfer+compute pipeline split across nStreams CUDA
+// streams so H->D copies, kernels, and D->H copies can overlap.
+// Requires N to be evenly divisible by nStreams.
+//
+cudaError_t
+MeasureTimes(
+    float *msTotal,
+    size_t N,
+    float alpha,
+    int nStreams,
+    int nBlocks,
+    int nThreads )
+{
+    cudaError_t status;
+    float *dptrOut = 0, *hptrOut = 0;
+    float *dptrY = 0, *hptrY = 0;
+    float *dptrX = 0, *hptrX = 0;
+    cudaStream_t *streams = 0;
+    cudaEvent_t evStart = 0;
+    cudaEvent_t evStop = 0;
+    size_t streamStep = N / nStreams;
+
+    if ( N % nStreams ) {
+        printf( "Stream count must be evenly divisible into N\n" );
+        status = cudaErrorInvalidValue;
+        goto Error;
+    }
+
+    streams = new cudaStream_t[nStreams];
+    if ( !
streams ) {
+        status = cudaErrorMemoryAllocation;
+        goto Error;
+    }
+    memset( streams, 0, nStreams*sizeof(cudaStream_t) );
+    for ( int i = 0; i < nStreams; i++ ) {
+        CUDART_CHECK( cudaStreamCreate( &streams[i] ) );
+    }
+    CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), 0 ) );
+    memset( hptrOut, 0, N*sizeof(float) );
+    CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), 0 ) );
+    CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), 0 ) );
+
+    CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
+    CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
+
+    CUDART_CHECK( cudaMalloc( &dptrY, N*sizeof(float) ) );
+    CUDART_CHECK( cudaMemset( dptrY, 0, N*sizeof(float) ) );
+
+    CUDART_CHECK( cudaMalloc( &dptrX, N*sizeof(float) ) );
+    // BUGFIX: was a duplicated cudaMemset( dptrY, ... ), leaving dptrX
+    // uninitialized.
+    CUDART_CHECK( cudaMemset( dptrX, 0, N*sizeof(float) ) );
+
+    CUDART_CHECK( cudaEventCreate( &evStart ) );
+    CUDART_CHECK( cudaEventCreate( &evStop ) );
+    for ( size_t i = 0; i < N; i++ ) {
+        hptrX[i] = (float) rand() / RAND_MAX;
+        hptrY[i] = (float) rand() / RAND_MAX;
+    }
+    CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
+
+    for ( int iStream = 0; iStream < nStreams; iStream++ ) {
+        CUDART_CHECK( cudaMemcpyAsync( dptrX+iStream*streamStep, hptrX+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyHostToDevice, streams[iStream] ) );
+        CUDART_CHECK( cudaMemcpyAsync( dptrY+iStream*streamStep, hptrY+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyHostToDevice, streams[iStream] ) );
+    }
+
+    for ( int iStream = 0; iStream < nStreams; iStream++ ) {
+        // BUGFIX: execution configuration was garbled to "<<>>"; launch
+        // each chunk in its own stream so copies and kernels overlap.
+        saxpy<<<nBlocks, nThreads, 0, streams[iStream]>>>( dptrOut+iStream*streamStep, dptrX+iStream*streamStep, dptrY+iStream*streamStep, streamStep, alpha );
+    }
+
+    for ( int iStream = 0; iStream < nStreams; iStream++ ) {
+        CUDART_CHECK( cudaMemcpyAsync( hptrOut+iStream*streamStep, dptrOut+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyDeviceToHost, streams[iStream] ) );
+    }
+
+    CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
+    CUDART_CHECK( cudaDeviceSynchronize() );
+    // Validate the GPU result against the expected saxpy value.
+    for ( size_t i = 0; i < N; i++ ) {
+        if (
fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
+            status = cudaErrorUnknown;
+            goto Error;
+        }
+    }
+    CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
+Error:
+    if ( streams ) {
+        for ( int i = 0; i < nStreams; i++ ) {
+            cudaStreamDestroy( streams[i] );
+        }
+        delete[] streams;
+    }
+    cudaEventDestroy( evStart );
+    cudaEventDestroy( evStop );
+    // BUGFIX: dptrOut and hptrOut were leaked.
+    cudaFree( dptrOut );
+    cudaFree( dptrX );
+    cudaFree( dptrY );
+    cudaFreeHost( hptrOut );
+    cudaFreeHost( hptrX );
+    cudaFreeHost( hptrY );
+    return status;
+}
+
+// Converts a time in ms and a byte count into bytes per second.
+double
+Bandwidth( float ms, double NumBytes )
+{
+    return NumBytes / (1000.0*ms);
+}
+
+int
+main( int argc, char *argv[] )
+{
+    cudaError_t status;
+    int N_Mfloats = 128;
+    size_t N;
+    int nStreams = 8;
+    int nBlocks = 1500;
+    int nThreads = 256;
+    float alpha = 2.0f;
+
+    chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
+    chCommandLineGet( &nThreads, "nThreads", argc, argv );
+    // BUGFIX: --nStreams was being written into nThreads.
+    chCommandLineGet( &nStreams, "nStreams", argc, argv );
+    chCommandLineGet( &N_Mfloats, "N", argc, argv );
+    printf( "Measuring times with %dM floats", N_Mfloats );
+    if ( N_Mfloats==128 ) {
+        printf( " (use --N to specify number of Mfloats)");
+    }
+    printf( "\n" );
+
+    // BUGFIX: compute in size_t to avoid 32-bit int overflow for large --N.
+    N = 1048576*(size_t) N_Mfloats;
+
+    CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
+    {
+        float msTotal;
+        CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nStreams, nBlocks, nThreads ) );
+        printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
+    }
+
+Error:
+    if ( status == cudaErrorMemoryAllocation ) {
+        printf( "Memory allocation failed\n" );
+    }
+    else if ( cudaSuccess != status ) {
+        printf( "Failed\n" );
+    }
+    return cudaSuccess != status;
+}
diff --git a/streaming/stream3Mapped.cu b/streaming/stream3Mapped.cu
new file mode 100644
index 0000000..7a2e681
--- /dev/null
+++ b/streaming/stream3Mapped.cu
@@ -0,0 +1,162 @@
+/*
+ *
+ * stream3Mapped.cu
+ *
+ * Formulation of stream1Async.cu that uses mapped pinned memory to
+ * hold the input and output data.
Since the kernel can use mapped
+ * pinned memory to initiate DMA transfers across the bus, this
+ * version is simpler (no cudaMemcpy() calls) and just as fast.
+ *
+ * Build with: nvcc -I ../chLib stream3Mapped.cu
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <chError.h>
+#include <chCommandLine.h>
+
+//
+// saxpy global function adds in[i]*alpha to each element out[i]
+//
+// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
+//
+
+__global__ void
+saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    // Grid-stride loop: correct for any launch configuration and any N.
+    for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
+          i < N;
+          i += blockDim.x*gridDim.x ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+// CPU reference implementation of the same computation.
+void
+saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    for ( size_t i = 0; i < N; i++ ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+//
+// Times the kernel operating directly on mapped pinned host memory;
+// the kernel's own loads/stores perform the bus transfers, so no
+// explicit memcpy calls are needed.
+//
+cudaError_t
+MeasureTimes(
+    float *msTotal,
+    size_t N,
+    float alpha,
+    int nBlocks,
+    int nThreads )
+{
+    cudaError_t status;
+    float *dptrOut = 0, *hptrOut = 0;
+    float *dptrY = 0, *hptrY = 0;
+    float *dptrX = 0, *hptrX = 0;
+    cudaEvent_t evStart = 0;
+    cudaEvent_t evStop = 0;
+
+    CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), cudaHostAllocMapped ) );
+    CUDART_CHECK( cudaHostGetDevicePointer( &dptrOut, hptrOut, 0 ) );
+    memset( hptrOut, 0, N*sizeof(float) );
+    CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), cudaHostAllocMapped ) );
+    CUDART_CHECK( cudaHostGetDevicePointer( &dptrY, hptrY, 0 ) );
+    CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), cudaHostAllocMapped ) );
+    CUDART_CHECK( cudaHostGetDevicePointer( &dptrX, hptrX, 0 ) );
+
+    CUDART_CHECK( cudaEventCreate( &evStart ) );
+    CUDART_CHECK( cudaEventCreate( &evStop ) );
+    for ( size_t i = 0; i < N; i++ ) {
+        hptrX[i] = (float) rand() / RAND_MAX;
+        hptrY[i] = (float) rand() / RAND_MAX;
+    }
+    CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
+    // BUGFIX: execution configuration was garbled to "<<>>"; restore it.
+    saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
+    CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
+    CUDART_CHECK( cudaDeviceSynchronize() );
+    // Validate the GPU result against the expected saxpy value.
+    for ( size_t i = 0; i < N; i++ ) {
+        if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
+            status = cudaErrorUnknown;
+            goto Error;
+        }
+    }
+    CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
+Error:
+    cudaEventDestroy( evStop );
+    cudaEventDestroy( evStart );
+    cudaFreeHost( hptrOut );
+    cudaFreeHost( hptrX );
+    cudaFreeHost( hptrY );
+    return status;
+}
+
+// Converts a time in ms and a byte count into bytes per second.
+double
+Bandwidth( float ms, double
NumBytes ) +{ + return NumBytes / (1000.0*ms); +} + +int +main( int argc, char *argv[] ) +{ + cudaError_t status; + int N_Mfloats = 128; + size_t N; + int nBlocks = 1500; + int nThreads = 256; + float alpha = 2.0f; + + chCommandLineGet( &nBlocks, "nBlocks", argc, argv ); + chCommandLineGet( &nThreads, "nThreads", argc, argv ); + chCommandLineGet( &N_Mfloats, "N", argc, argv ); + printf( "Measuring times with %dM floats", N_Mfloats ); + if ( N_Mfloats==128 ) { + printf( " (use --N to specify number of Mfloats)"); + } + printf( "\n" ); + + N = 1048576*N_Mfloats; + + CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) ); + { + float msTotal; + CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nBlocks, nThreads ) ); + printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) ); + } + +Error: + if ( status == cudaErrorMemoryAllocation ) { + printf( "Memory allocation failed\n" ); + } + else if ( cudaSuccess != status ) { + printf( "Failed\n" ); + } + return cudaSuccess != status; +} diff --git a/streaming/stream4MappedXfer.cu b/streaming/stream4MappedXfer.cu new file mode 100644 index 0000000..8d86a9d --- /dev/null +++ b/streaming/stream4MappedXfer.cu @@ -0,0 +1,164 @@ +/* + * + * stream4MappedXfer.cu + * + * Formulation of stream1Async.cu that uses mapped pinned memory to + * perform a computation on inputs in host memory while writing the + * output to device memory. + * + * Build with: nvcc -I ../chLib stream4MappedXfer.cu + * + * Copyright (c) 2012, Archaea Software, LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <chError.h>
+#include <chCommandLine.h>
+
+//
+// saxpy global function adds in[i]*alpha to each element out[i]
+//
+// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
+//
+
+__global__ void
+saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    // Grid-stride loop: correct for any launch configuration and any N.
+    for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
+          i < N;
+          i += blockDim.x*gridDim.x ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+// CPU reference implementation of the same computation.
+void
+saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
+{
+    for ( size_t i = 0; i < N; i++ ) {
+        out[i] += alpha*x[i]+y[i];
+    }
+}
+
+//
+// Times a kernel that reads its inputs from mapped pinned host memory
+// and writes its output to device memory, followed by an explicit
+// device->host copy of the result.
+//
+cudaError_t
+MeasureTimes(
+    float *msTotal,
+    size_t N,
+    float alpha,
+    int nBlocks,
+    int nThreads )
+{
+    cudaError_t status;
+    float *dptrOut = 0, *hptrOut = 0;
+    float *dptrY = 0, *hptrY = 0;
+    float *dptrX = 0, *hptrX = 0;
+    cudaEvent_t evStart = 0;
+    cudaEvent_t evStop = 0;
+
+    CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), cudaHostAllocMapped ) );
+    CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
+    CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
+    // BUGFIX: a stray cudaHostGetDevicePointer( &dptrOut, hptrOut, 0 )
+    // here overwrote the cudaMalloc'd pointer, leaking the device buffer
+    // and defeating this variant's purpose (kernel writes device memory,
+    // then an explicit D->H copy brings the result back).
+    memset( hptrOut, 0, N*sizeof(float) );
+    CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), cudaHostAllocMapped ) );
+    CUDART_CHECK( cudaHostGetDevicePointer( &dptrY, hptrY, 0 ) );
+    CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), cudaHostAllocMapped ) );
+    CUDART_CHECK( cudaHostGetDevicePointer( &dptrX, hptrX, 0 ) );
+
+    CUDART_CHECK( cudaEventCreate( &evStart ) );
+    CUDART_CHECK( cudaEventCreate( &evStop ) );
+    for ( size_t i = 0; i < N; i++ ) {
+        hptrX[i] = (float) rand() / RAND_MAX;
+        hptrY[i] = (float) rand() / RAND_MAX;
+    }
+    CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
+    // BUGFIX: execution configuration was garbled to "<<>>"; restore it.
+    saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
+    CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
+    CUDART_CHECK( cudaMemcpy( hptrOut, dptrOut, N*sizeof(float), cudaMemcpyDeviceToHost ) );
+    CUDART_CHECK( cudaDeviceSynchronize() );
+    // Validate the GPU result against the expected saxpy value.
+    for ( size_t i = 0; i < N; i++ ) {
+        if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
+            status = cudaErrorUnknown;
+            goto Error;
+        }
+    }
+    CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop
) );
+Error:
+    cudaEventDestroy( evStop );
+    cudaEventDestroy( evStart );
+    // BUGFIX: the cudaMalloc'd output buffer was leaked.
+    cudaFree( dptrOut );
+    cudaFreeHost( hptrOut );
+    cudaFreeHost( hptrX );
+    cudaFreeHost( hptrY );
+    return status;
+}
+
+// Converts a time in ms and a byte count into bytes per second.
+double
+Bandwidth( float ms, double NumBytes )
+{
+    return NumBytes / (1000.0*ms);
+}
+
+int
+main( int argc, char *argv[] )
+{
+    cudaError_t status;
+    int N_Mfloats = 128;
+    size_t N;
+    int nBlocks = 1500;
+    int nThreads = 256;
+    float alpha = 2.0f;
+
+    chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
+    chCommandLineGet( &nThreads, "nThreads", argc, argv );
+    chCommandLineGet( &N_Mfloats, "N", argc, argv );
+    printf( "Measuring times with %dM floats", N_Mfloats );
+    if ( N_Mfloats==128 ) {
+        printf( " (use --N to specify number of Mfloats)");
+    }
+    printf( "\n" );
+
+    // BUGFIX: compute in size_t to avoid 32-bit int overflow for large --N.
+    N = 1048576*(size_t) N_Mfloats;
+
+    CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
+    {
+        float msTotal;
+        CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nBlocks, nThreads ) );
+        printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
+    }
+
+Error:
+    if ( status == cudaErrorMemoryAllocation ) {
+        printf( "Memory allocation failed\n" );
+    }
+    else if ( cudaSuccess != status ) {
+        printf( "Failed\n" );
+    }
+    return cudaSuccess != status;
+}