Initial cut, streaming samples

Nicholas Wilt, 13 years ago · commit 012ee91830 · branch pull/1/head
4 changed files, 724 additions:

  streaming/stream1Async.cu       +192
  streaming/stream2Streams.cu     +206
  streaming/stream3Mapped.cu      +162
  streaming/stream4MappedXfer.cu  +164

streaming/stream1Async.cu (+192 lines)

@@ -0,0 +1,192 @@
/*
*
* stream1Async.cu
*
* Microbenchmark to illustrate a bandwidth-limited workload.
*
* It separately measures the host->device transfer time, kernel
* processing time, and device->host transfer time. Due to low
* arithmetic density in the saxpy() kernel, the bulk of time
* is spent transferring data.
*
* Build with: nvcc -I ../chLib stream1Async.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
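//
// CUDART_CHECK() comes from chError.h (in ../chLib, not part of this
// commit). Based on how it is used below -- it assigns the return value
// to the local 'status' and jumps to the Error: label on failure -- a
// minimal sketch of the assumed macro would be:
//
//     #define CUDART_CHECK( call ) \
//         do { \
//             status = (call); \
//             if ( cudaSuccess != status ) { \
//                 goto Error; \
//             } \
//         } while ( 0 )
//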
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
float *msHtoD,
float *msKernel,
float *msDtoH,
size_t N,
float alpha,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evHtoD = 0;
cudaEvent_t evKernel = 0;
cudaEvent_t evDtoH = 0;
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), 0 ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrY, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrY, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrX, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrX, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evHtoD ) );
CUDART_CHECK( cudaEventCreate( &evKernel ) );
CUDART_CHECK( cudaEventCreate( &evDtoH ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
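//
// Events are recorded into the NULL stream around each phase: evStart
// before the host->device copies, evHtoD after them, evKernel after the
// kernel launch, and evDtoH after the device->host copy. The elapsed
// time between consecutive events gives the per-phase timings reported
// by main().
//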
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
CUDART_CHECK( cudaMemcpyAsync( dptrX, hptrX, N*sizeof(float), cudaMemcpyHostToDevice, NULL ) );
CUDART_CHECK( cudaMemcpyAsync( dptrY, hptrY, N*sizeof(float), cudaMemcpyHostToDevice, NULL ) );
CUDART_CHECK( cudaEventRecord( evHtoD, 0 ) );
saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
CUDART_CHECK( cudaEventRecord( evKernel, 0 ) );
CUDART_CHECK( cudaMemcpyAsync( hptrOut, dptrOut, N*sizeof(float), cudaMemcpyDeviceToHost, NULL ) );
CUDART_CHECK( cudaEventRecord( evDtoH, 0 ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msHtoD, evStart, evHtoD ) );
CUDART_CHECK( cudaEventElapsedTime( msKernel, evHtoD, evKernel ) );
CUDART_CHECK( cudaEventElapsedTime( msDtoH, evKernel, evDtoH ) );
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evDtoH ) );
Error:
cudaEventDestroy( evDtoH );
cudaEventDestroy( evKernel );
cudaEventDestroy( evHtoD );
cudaEventDestroy( evStart );
cudaFree( dptrOut );
cudaFree( dptrX );
cudaFree( dptrY );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
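//
// Returns bandwidth in MB/s (1 MB = 1e6 bytes), given the byte count
// and the elapsed time in milliseconds.
//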
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal, msHtoD, msKernel, msDtoH;
CUDART_CHECK( MeasureTimes( &msTotal, &msHtoD, &msKernel, &msDtoH, N, alpha, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
printf( "Memcpy( host->device ): %.2f ms (%.2f MB/s)\n", msHtoD, Bandwidth( msHtoD, 2*N*sizeof(float) ) );
printf( "Kernel processing : %.2f ms (%.2f MB/s)\n", msKernel, Bandwidth( msKernel, 3*N*sizeof(float) ) );
printf( "Memcpy (device->host ): %.2f ms (%.2f MB/s)\n", msDtoH, Bandwidth( msDtoH, N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}

streaming/stream2Streams.cu (+206 lines)

@@ -0,0 +1,206 @@
/*
*
* stream2Streams.cu
*
* Formulation of stream1Async.cu that uses streams to overlap data
* transfers and kernel processing.
*
* Build with: nvcc -I ../chLib stream2Streams.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
size_t N,
float alpha,
int nStreams,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaStream_t *streams = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evStop = 0;
size_t streamStep = N / nStreams;
if ( N % nStreams ) {
printf( "Stream count must be evenly divisible into N\n" );
status = cudaErrorInvalidValue;
goto Error;
}
streams = new cudaStream_t[nStreams];
if ( ! streams ) {
status = cudaErrorMemoryAllocation;
goto Error;
}
memset( streams, 0, nStreams*sizeof(cudaStream_t) );
for ( int i = 0; i < nStreams; i++ ) {
CUDART_CHECK( cudaStreamCreate( &streams[i] ) );
}
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), 0 ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), 0 ) );
CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrY, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrY, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaMalloc( &dptrX, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrX, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
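//
// Issue the work breadth-first: all host->device copies, then all
// kernel launches, then all device->host copies, each chunk in its own
// stream. This ordering lets copies in one stream overlap with kernel
// execution in another.
//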
for ( int iStream = 0; iStream < nStreams; iStream++ ) {
CUDART_CHECK( cudaMemcpyAsync( dptrX+iStream*streamStep, hptrX+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyHostToDevice, streams[iStream] ) );
CUDART_CHECK( cudaMemcpyAsync( dptrY+iStream*streamStep, hptrY+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyHostToDevice, streams[iStream] ) );
}
for ( int iStream = 0; iStream < nStreams; iStream++ ) {
saxpy<<<nBlocks, nThreads, 0, streams[iStream]>>>( dptrOut+iStream*streamStep, dptrX+iStream*streamStep, dptrY+iStream*streamStep, streamStep, alpha );
}
for ( int iStream = 0; iStream < nStreams; iStream++ ) {
CUDART_CHECK( cudaMemcpyAsync( hptrOut+iStream*streamStep, dptrOut+iStream*streamStep, streamStep*sizeof(float), cudaMemcpyDeviceToHost, streams[iStream] ) );
}
CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
Error:
if ( streams ) {
for ( int i = 0; i < nStreams; i++ ) {
cudaStreamDestroy( streams[i] );
}
delete[] streams;
}
cudaEventDestroy( evStart );
cudaEventDestroy( evStop );
cudaFree( dptrOut );
cudaFree( dptrX );
cudaFree( dptrY );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nStreams = 8;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &nThreads, "nStreams", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal;
CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nStreams, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}

streaming/stream3Mapped.cu (+162 lines)

@@ -0,0 +1,162 @@
/*
*
* stream3Mapped.cu
*
* Formulation of stream1Async.cu that uses mapped pinned memory to
* hold the input and output data. Since the kernel can use mapped
* pinned memory to initiate DMA transfers across the bus, this
* version is simpler (no cudaMemcpy() calls) and just as fast.
*
* Build with: nvcc -I ../chLib stream3Mapped.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
size_t N,
float alpha,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evStop = 0;
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrOut, hptrOut, 0 ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrY, hptrY, 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrX, hptrX, 0 ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
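//
// The kernel reads x and y and writes out directly through the mapped
// host pointers, so the PCI Express transfers happen as part of kernel
// execution and no explicit cudaMemcpy() is needed. The
// cudaDeviceSynchronize() below ensures the kernel's writes are visible
// to the CPU before validation.
//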
saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
Error:
cudaEventDestroy( evStop );
cudaEventDestroy( evStart );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal;
CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}

streaming/stream4MappedXfer.cu (+164 lines)

@@ -0,0 +1,164 @@
/*
*
* stream4MappedXfer.cu
*
* Formulation of stream1Async.cu that uses mapped pinned memory to
* perform a computation on inputs in host memory while writing the
* output to device memory.
*
* Build with: nvcc -I ../chLib stream4MappedXfer.cu
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <stdio.h>
#include <stdlib.h>
//
// saxpy global function adds alpha*x[i] + y[i] to each element out[i]
//
// Due to low arithmetic density, this kernel is extremely bandwidth-bound.
//
__global__ void
saxpy( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = blockIdx.x*blockDim.x + threadIdx.x;
i < N;
i += blockDim.x*gridDim.x ) {
out[i] += alpha*x[i]+y[i];
}
}
void
saxpyCPU( float *out, const float *x, const float *y, size_t N, float alpha )
{
for ( size_t i = 0; i < N; i++ ) {
out[i] += alpha*x[i]+y[i];
}
}
cudaError_t
MeasureTimes(
float *msTotal,
size_t N,
float alpha,
int nBlocks,
int nThreads )
{
cudaError_t status;
float *dptrOut = 0, *hptrOut = 0;
float *dptrY = 0, *hptrY = 0;
float *dptrX = 0, *hptrX = 0;
cudaEvent_t evStart = 0;
cudaEvent_t evStop = 0;
CUDART_CHECK( cudaHostAlloc( &hptrOut, N*sizeof(float), cudaHostAllocMapped ) );
memset( hptrOut, 0, N*sizeof(float) );
CUDART_CHECK( cudaMalloc( &dptrOut, N*sizeof(float) ) );
CUDART_CHECK( cudaMemset( dptrOut, 0, N*sizeof(float) ) );
CUDART_CHECK( cudaHostAlloc( &hptrY, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrY, hptrY, 0 ) );
CUDART_CHECK( cudaHostAlloc( &hptrX, N*sizeof(float), cudaHostAllocMapped ) );
CUDART_CHECK( cudaHostGetDevicePointer( &dptrX, hptrX, 0 ) );
CUDART_CHECK( cudaEventCreate( &evStart ) );
CUDART_CHECK( cudaEventCreate( &evStop ) );
for ( size_t i = 0; i < N; i++ ) {
hptrX[i] = (float) rand() / RAND_MAX;
hptrY[i] = (float) rand() / RAND_MAX;
}
CUDART_CHECK( cudaEventRecord( evStart, 0 ) );
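//
// The inputs x and y are read through mapped host pointers, while the
// output is written to device memory; the cudaMemcpy() after the timed
// region copies the result back to the host for validation.
//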
saxpy<<<nBlocks, nThreads>>>( dptrOut, dptrX, dptrY, N, alpha );
CUDART_CHECK( cudaEventRecord( evStop, 0 ) );
CUDART_CHECK( cudaMemcpy( hptrOut, dptrOut, N*sizeof(float), cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaDeviceSynchronize() );
for ( size_t i = 0; i < N; i++ ) {
if ( fabsf( hptrOut[i] - (alpha*hptrX[i]+hptrY[i]) ) > 1e-5f ) {
status = cudaErrorUnknown;
goto Error;
}
}
CUDART_CHECK( cudaEventElapsedTime( msTotal, evStart, evStop ) );
Error:
cudaEventDestroy( evStop );
cudaEventDestroy( evStart );
cudaFree( dptrOut );
cudaFreeHost( hptrOut );
cudaFreeHost( hptrX );
cudaFreeHost( hptrY );
return status;
}
double
Bandwidth( float ms, double NumBytes )
{
return NumBytes / (1000.0*ms);
}
int
main( int argc, char *argv[] )
{
cudaError_t status;
int N_Mfloats = 128;
size_t N;
int nBlocks = 1500;
int nThreads = 256;
float alpha = 2.0f;
chCommandLineGet( &nBlocks, "nBlocks", argc, argv );
chCommandLineGet( &nThreads, "nThreads", argc, argv );
chCommandLineGet( &N_Mfloats, "N", argc, argv );
printf( "Measuring times with %dM floats", N_Mfloats );
if ( N_Mfloats==128 ) {
printf( " (use --N to specify number of Mfloats)");
}
printf( "\n" );
N = 1048576*N_Mfloats;
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
{
float msTotal;
CUDART_CHECK( MeasureTimes( &msTotal, N, alpha, nBlocks, nThreads ) );
printf( "Total time: %.2f ms (%.2f MB/s)\n", msTotal, Bandwidth( msTotal, 3*N*sizeof(float) ) );
}
Error:
if ( status == cudaErrorMemoryAllocation ) {
printf( "Memory allocation failed\n" );
}
else if ( cudaSuccess != status ) {
printf( "Failed\n" );
}
return cudaSuccess != status;
}