Add normalized correlation code

13 years ago · 40457387da
16 changed files with 2272 additions and 0 deletions
--- a/corr/coins.bmp
+++ b/corr/coins.bmp
--- a/corr/coins.pgm
+++ b/corr/coins.pgm
--- a/corr/corrShared.cuh
+++ b/corr/corrShared.cuh
@ -0,0 +1,150 @@
				@@ -0,0 +1,150 @@
+/*
+ *
+ * corrShared.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the image from shared memory and the template 
+ * from constant memory.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+extern __shared__ unsigned char LocalBlock[];
+
+
+__global__ void 
+corrShared_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int wTile,
+    int wTemplate, int hTemplate,
+    float xOffset, float yOffset,
+    float cPixels, float fDenomExp, int SharedPitch,
+    float xUL, float yUL, int w, int h )
+{ 
+    int uTile = blockIdx.x*wTile;
+    int vTile = blockIdx.y*blockDim.y;
+    int v = vTile + threadIdx.y;
+
+    float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
+
+    for ( int row = threadIdx.y; 
+              row < blockDim.y+hTemplate; 
+              row += blockDim.y ) {
+        int SharedIdx = row * SharedPitch;
+        for ( int col = threadIdx.x; 
+                  col < wTile+wTemplate; 
+                  col += blockDim.x ) {
+
+            LocalBlock[SharedIdx+col] = 
+                tex2D( texImage, 
+                       (float) (uTile+col+xUL+xOffset), 
+                       (float) (vTile+row+yUL+yOffset) );
+
+        }
+    }
+
+    __syncthreads();
+
+    for ( int col = threadIdx.x; 
+              col < wTile; 
+              col += blockDim.x ) {
+
+        int SumI = 0;
+        int SumISq = 0;
+        int SumIT = 0;
+        int idx = 0;
+        int SharedIdx = threadIdx.y * SharedPitch + col;
+        for ( int j = 0; j < hTemplate; j++ ) {    
+            for ( int i = 0; i < wTemplate; i++) { 
+                unsigned char I = LocalBlock[SharedIdx+i];
+                unsigned char T = g_Tpix[idx++];
+                SumI += I;
+                SumISq += I*I;
+                SumIT += I*T;
+            }
+            SharedIdx += SharedPitch;
+        }
+        if ( uTile+col < w && v < h ) {
+            pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
+        }
+    }
+    __syncthreads();
+}
+
+
+void
+corrShared( 
+    float *dCorr, int CorrPitch,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    int device;
+    cudaDeviceProp props;
+    cudaError_t status;
+
+    CUDART_CHECK( cudaGetDevice( &device ) );
+    CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
+    if ( sharedMem > props.sharedMemPerBlock ) {
+        dim3 tcThreads(32, 16, 1);
+        dim3 tcBlocks;
+        tcBlocks.x = INTCEIL(w,threads.x); 
+        tcBlocks.y = INTCEIL(h,threads.y); 
+        tcBlocks.z = 1;
+        return corrTexConstant( 
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            cPixels,
+            fDenomExp,
+            sharedPitch,
+            xOffset, yOffset,
+            xTemplate, yTemplate,
+            xUL, yUL, w, h,
+            tcThreads, tcBlocks,
+            sharedMem );
+    }
+    corrShared_kernel<<<blocks, threads, sharedMem>>>(
+        dCorr, CorrPitch,
+        wTile,
+        wTemplate, hTemplate,
+        (float) xOffset, (float) yOffset,
+        cPixels, fDenomExp, 
+        sharedPitch, 
+        (float) xUL, (float) yUL, w, h );
+Error:
+    return;
+}
--- a/corr/corrShared4.cuh
+++ b/corr/corrShared4.cuh
@ -0,0 +1,186 @@
				@@ -0,0 +1,186 @@
+/*
+ *
+ * corrShared.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the image from shared memory and the template 
+ * from constant memory.  This implementation includes the 
+ * SM-aware optimizations of corrSharedSMSums.cuh and also 
+ * unrolls the innermost loop 4x.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+extern __shared__ unsigned char LocalBlock[];
+
+
+template<bool bSM1>
+__global__ void 
+corrShared4_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int wTile,
+    int wTemplate, int hTemplate,
+    float xOffset, float yOffset,
+    float cPixels, float fDenomExp, int SharedPitch,
+    float xUL, float yUL, int w, int h )
+{ 
+    int uTile = blockIdx.x*wTile;
+    int vTile = blockIdx.y*blockDim.y;
+    int v = vTile + threadIdx.y;
+
+    float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
+
+    for ( int row = threadIdx.y; 
+              row < blockDim.y+hTemplate; 
+              row += blockDim.y ) {
+        int SharedIdx = row * SharedPitch;
+        for ( int col = threadIdx.x; 
+                  col < wTile+wTemplate; 
+                  col += blockDim.x ) {
+
+            LocalBlock[SharedIdx+col] = 
+                tex2D( texImage, 
+                      (float) (uTile+col+xUL+xOffset), 
+                      (float) (vTile+row+yUL+yOffset) );
+
+        }
+    }
+
+    __syncthreads();
+
+    for ( int col = threadIdx.x; 
+              col < wTile; 
+              col += blockDim.x ) {
+
+        int SumI = 0;
+        int SumISq = 0;
+        int SumIT = 0;
+        int idx = 0;
+        int SharedIdx = threadIdx.y * SharedPitch + col;
+        for ( int j = 0; j < hTemplate; j++ ) {
+            for ( int i = 0; i < wTemplate/4; i++) { 
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+0], g_Tpix[idx++] );
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+1], g_Tpix[idx++] );
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+2], g_Tpix[idx++] );
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+3], g_Tpix[idx++] );
+            }
+            SharedIdx += SharedPitch;
+        }
+        if ( uTile+col < w && v < h ) {
+            pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
+        }
+    }
+    __syncthreads();
+}
+
+
+void
+corrShared4( 
+    float *dCorr, int CorrPitch,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    int device;
+    cudaDeviceProp props;
+    cudaError_t status;
+
+    CUDART_CHECK( cudaGetDevice( &device ) );
+    CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
+    if ( sharedMem > props.sharedMemPerBlock ) {
+        dim3 tcThreads(32, 16, 1);
+        dim3 tcBlocks;
+        tcBlocks.x = INTCEIL(w,threads.x); 
+        tcBlocks.y = INTCEIL(h,threads.y); 
+        tcBlocks.z = 1;
+        return corrTexConstant( 
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            cPixels,
+            fDenomExp,
+            sharedPitch,
+            xOffset, yOffset,
+            xTemplate, yTemplate,
+            xUL, yUL, w, h,
+            tcThreads, tcBlocks,
+            sharedMem );
+    }
+    if ( wTemplate % 4 ) {
+        if ( props.major == 1 ) {
+            corrSharedSM_kernel<true><<<blocks, threads, sharedMem>>>(
+                dCorr, CorrPitch,
+                wTile,
+                wTemplate, hTemplate,
+                (float) xOffset, (float) yOffset,
+                cPixels, fDenomExp, 
+                sharedPitch, 
+                (float) xUL, (float) yUL, w, h );
+        }
+        else {
+            corrSharedSM_kernel<false><<<blocks, threads, sharedMem>>>(
+                dCorr, CorrPitch,
+                wTile,
+                wTemplate, hTemplate,
+                (float) xOffset, (float) yOffset,
+                cPixels, fDenomExp, 
+                sharedPitch, 
+                (float) xUL, (float) yUL, w, h );
+        }
+    }
+    if ( props.major == 1 ) {
+        corrShared4_kernel<true><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+    else {
+        corrShared4_kernel<false><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+Error:
+    return;
+}
--- a/corr/corrShared4Sums.cuh
+++ b/corr/corrShared4Sums.cuh
@ -0,0 +1,217 @@
				@@ -0,0 +1,217 @@
+/*
+ *
+ * corrShared4Sums.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the image from shared memory and the template 
+ * from constant memory, and reports the sums (I, IT, ISq)
+ * as well as the output coefficients.  This implementation
+ * includes the SM-aware optimizations of corrSharedSMSums.cuh
+ * and also unrolls the innermost loop 4x.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+extern __shared__ unsigned char LocalBlock[];
+
+template<bool bSM1>
+__device__ void
+corrSharedAccumulate( 
+    int& SumI, int& SumISq, int& SumIT, 
+    unsigned char I, unsigned char T )
+{
+    SumI += I;
+    if ( bSM1 ) {
+        SumISq += __umul24(I,I);
+        SumIT += __umul24(I,T);
+    }
+    else {
+        SumISq += I*I;
+        SumIT += I*T;
+    }
+}
+
+template<bool bSM1>
+__global__ void 
+corrShared4Sums_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int *pI, int *pISq, int *pIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float xOffset, float yOffset,
+    float cPixels, float fDenomExp, int SharedPitch,
+    float xUL, float yUL, int w, int h )
+{ 
+    int uTile = blockIdx.x*wTile;
+    int vTile = blockIdx.y*blockDim.y;
+    int v = vTile + threadIdx.y;
+
+    float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
+    pI = (int *) (((char *) pI)+v*CorrPitch);
+    pISq = (int *) (((char *) pISq)+v*CorrPitch);
+    pIT = (int *) (((char *) pIT)+v*CorrPitch);
+
+    for ( int row = threadIdx.y; 
+              row < blockDim.y+hTemplate; 
+              row += blockDim.y ) {
+        int SharedIdx = row * SharedPitch;
+
+        for ( int col = threadIdx.x; 
+                  col < wTile+wTemplate; 
+                  col += blockDim.x ) {
+
+            LocalBlock[SharedIdx+col] = 
+                tex2D( texImage, 
+                       (float) (uTile+col+xUL+xOffset), 
+                       (float) (vTile+row+yUL+yOffset) );
+
+        }
+    }
+
+    __syncthreads();
+
+    for ( int col = threadIdx.x; 
+              col < wTile; 
+              col += blockDim.x ) {
+
+        int SumI = 0;
+        int SumISq = 0;
+        int SumIT = 0;
+        int idx = 0;
+        int SharedIdx = threadIdx.y * SharedPitch + col;
+        for ( int j = 0; j < hTemplate; j++ ) {    
+            for ( int i = 0; i < wTemplate/4; i++) { 
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+0], g_Tpix[idx++] );
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+1], g_Tpix[idx++] );
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+2], g_Tpix[idx++] );
+                corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+3], g_Tpix[idx++] );
+            }
+            SharedIdx += SharedPitch;
+        }
+        if ( uTile+col < w && v < h ) {
+            pI[uTile+col] = SumI;
+            pISq[uTile+col] = SumISq;
+            pIT[uTile+col] = SumIT;
+            pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, g_fDenomExp );
+        }
+    }
+    __syncthreads();
+}
+
+
+void
+corrShared4Sums( 
+    float *dCorr, int CorrPitch,
+    int *dSumI, int *dSumISq, int *dSumIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    int device;
+    cudaDeviceProp props;
+    cudaError_t status;
+
+    CUDART_CHECK( cudaGetDevice( &device ) );
+    CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
+    if ( sharedMem > props.sharedMemPerBlock ) {
+        dim3 tcThreads(32, 16, 1);
+        dim3 tcBlocks;
+        tcBlocks.x = INTCEIL(w,threads.x); 
+        tcBlocks.y = INTCEIL(h,threads.y); 
+        tcBlocks.z = 1;
+        return corrTexConstantSums( 
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            cPixels,
+            fDenomExp,
+            sharedPitch,
+            xOffset, yOffset,
+            xTemplate, yTemplate,
+            xUL, yUL, w, h,
+            tcThreads, tcBlocks,
+            sharedMem );
+    }
+    if ( wTemplate%4 ) {
+        if ( props.major == 1 ) {
+            corrSharedSMSums_kernel<true><<<blocks, threads, sharedMem>>>(
+                dCorr, CorrPitch,
+                dSumI, dSumISq, dSumIT,
+                wTile,
+                wTemplate, hTemplate,
+                (float) xOffset, (float) yOffset,
+                cPixels, fDenomExp, 
+                sharedPitch, 
+                (float) xUL, (float) yUL, w, h );
+        }
+        else {
+            corrSharedSMSums_kernel<false><<<blocks, threads, sharedMem>>>(
+                dCorr, CorrPitch,
+                dSumI, dSumISq, dSumIT,
+                wTile,
+                wTemplate, hTemplate,
+                (float) xOffset, (float) yOffset,
+                cPixels, fDenomExp, 
+                sharedPitch, 
+                (float) xUL, (float) yUL, w, h );
+        }
+    }
+    if ( props.major == 1 ) {
+        corrShared4Sums_kernel<true><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+    else {
+        corrShared4Sums_kernel<false><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+Error:
+    return;
+}
--- a/corr/corrSharedSM.cuh
+++ b/corr/corrSharedSM.cuh
@ -0,0 +1,171 @@
				@@ -0,0 +1,171 @@
+/*
+ *
+ * corrSharedSM.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the image from shared memory and the template 
+ * from constant memory.  This implementation is SM-aware;
+ * it launches different kernels depending on whether it's
+ * running on SM 1.x hardware.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+extern __shared__ unsigned char LocalBlock[];
+
+
+template<bool bSM1>
+__global__ void 
+corrSharedSM_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int wTile,
+    int wTemplate, int hTemplate,
+    float xOffset, float yOffset,
+    float cPixels, float fDenomExp, int SharedPitch,
+    float xUL, float yUL, int w, int h )
+{ 
+    int uTile = blockIdx.x*wTile;
+    int vTile = blockIdx.y*blockDim.y;
+    int v = vTile + threadIdx.y;
+
+    float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
+
+    for ( int row = threadIdx.y; 
+              row < blockDim.y+hTemplate; 
+              row += blockDim.y ) {
+        int SharedIdx = row * SharedPitch;
+        for ( int col = threadIdx.x; 
+                  col < wTile+wTemplate; 
+                  col += blockDim.x ) {
+
+            LocalBlock[SharedIdx+col] = 
+                tex2D( texImage, 
+                       (float) (uTile+col+xUL+xOffset), 
+                       (float) (vTile+row+yUL+yOffset) );
+
+        }
+    }
+
+    __syncthreads();
+
+    for ( int col = threadIdx.x; 
+              col < wTile; 
+              col += blockDim.x ) {
+
+        int SumI = 0;
+        int SumISq = 0;
+        int SumIT = 0;
+        int idx = 0;
+        int SharedIdx = threadIdx.y * SharedPitch + col;
+        for ( int j = 0; j < hTemplate; j++ ) {    
+            for ( int i = 0; i < wTemplate; i++) { 
+                unsigned char I = LocalBlock[SharedIdx+i];
+                unsigned char T = g_Tpix[idx++];
+                SumI += I;
+                if ( bSM1 ) {
+                    SumISq += __umul24(I, I);
+                    SumIT += __umul24(I, T);
+                }
+                else {
+                    SumISq += I*I;
+                    SumIT += I*T;
+                }
+            }
+            SharedIdx += SharedPitch;
+        }
+        if ( uTile+col < w && v < h ) {
+            pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
+        }
+    }
+    __syncthreads();
+}
+
+
+void
+corrSharedSM(
+    float *dCorr, int CorrPitch,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    int device;
+    cudaDeviceProp props;
+    cudaError_t status;
+
+    CUDART_CHECK( cudaGetDevice( &device ) );
+    CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
+    if ( sharedMem > props.sharedMemPerBlock ) {
+        dim3 tcThreads(32, 16, 1);
+        dim3 tcBlocks;
+        tcBlocks.x = INTCEIL(w,threads.x); 
+        tcBlocks.y = INTCEIL(h,threads.y); 
+        tcBlocks.z = 1;
+        return corrTexConstant( 
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            cPixels,
+            fDenomExp,
+            sharedPitch,
+            xOffset, yOffset,
+            xTemplate, yTemplate,
+            xUL, yUL, w, h,
+            tcThreads, tcBlocks,
+            sharedMem );
+    }
+    if ( props.major == 1 ) {
+        corrSharedSM_kernel<true><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+    else {
+        corrSharedSM_kernel<false><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+Error:
+    return;
+}
--- a/corr/corrSharedSMSums.cuh
+++ b/corr/corrSharedSMSums.cuh
@ -0,0 +1,183 @@
				@@ -0,0 +1,183 @@
+/*
+ *
+ * corrSharedSums.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the image from shared memory and the template 
+ * from constant memory, and reports the sums (I, IT, ISq)
+ * as well as the output coefficients.  This implementation 
+ * is SM-aware; it launches different kernels depending on 
+ * whether it's running on SM 1.x hardware.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+extern __shared__ unsigned char LocalBlock[];
+
+template<bool bSM1>
+__global__ void 
+corrSharedSMSums_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int *pI, int *pISq, int *pIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float xOffset, float yOffset,
+    float cPixels, float fDenomExp, int SharedPitch,
+    float xUL, float yUL, int w, int h )
+{ 
+    int uTile = blockIdx.x*wTile;
+    int vTile = blockIdx.y*blockDim.y;
+    int v = vTile + threadIdx.y;
+
+    float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
+    pI = (int *) (((char *) pI)+v*CorrPitch);
+    pISq = (int *) (((char *) pISq)+v*CorrPitch);
+    pIT = (int *) (((char *) pIT)+v*CorrPitch);
+
+    for ( int row = threadIdx.y; 
+              row < blockDim.y+hTemplate; 
+              row += blockDim.y ) {
+        int SharedIdx = row * SharedPitch;
+
+        for ( int col = threadIdx.x; 
+                  col < wTile+wTemplate; 
+                  col += blockDim.x ) {
+
+            LocalBlock[SharedIdx+col] = 
+                tex2D( texImage, 
+                      (float) (uTile+col+xUL+xOffset), 
+                      (float) (vTile+row+yUL+yOffset) );
+
+        }
+    }
+
+    __syncthreads();
+
+    for ( int col = threadIdx.x; 
+              col < wTile; 
+              col += blockDim.x ) {
+
+        int SumI = 0;
+        int SumISq = 0;
+        int SumIT = 0;
+        int idx = 0;
+        int SharedIdx = threadIdx.y * SharedPitch + col;
+        for ( int j = 0; j < hTemplate; j++ ) {    
+            for ( int i = 0; i < wTemplate; i++) { 
+                unsigned char I = LocalBlock[SharedIdx+i];
+                unsigned char T = g_Tpix[idx++];
+                SumI += I;
+                if ( bSM1 ) {
+                    SumISq += __umul24( I, I );
+                    SumIT += __umul24( I, T );
+                }
+                else {
+                    SumISq += I*I;
+                    SumIT += I*T;
+                }
+            }
+            SharedIdx += SharedPitch;
+        }
+        if ( uTile+col < w && v < h ) {
+            pI[uTile+col] = SumI;
+            pISq[uTile+col] = SumISq;
+            pIT[uTile+col] = SumIT;
+            pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, g_fDenomExp );
+        }
+    }
+    __syncthreads();
+}
+
+
+void
+corrSharedSMSums( 
+    float *dCorr, int CorrPitch,
+    int *dSumI, int *dSumISq, int *dSumIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    int device;
+    cudaDeviceProp props;
+    cudaError_t status;
+
+    CUDART_CHECK( cudaGetDevice( &device ) );
+    CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
+    if ( sharedMem > props.sharedMemPerBlock ) {
+        dim3 tcThreads(32, 16, 1);
+        dim3 tcBlocks;
+        tcBlocks.x = INTCEIL(w,threads.x); 
+        tcBlocks.y = INTCEIL(h,threads.y); 
+        tcBlocks.z = 1;
+        return corrTexTexSums( 
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            cPixels,
+            fDenomExp,
+            sharedPitch,
+            xOffset, yOffset,
+            xTemplate, yTemplate,
+            xUL, yUL, w, h,
+            tcThreads, tcBlocks,
+            sharedMem );
+    }
+    if ( props.major == 1 ) {
+        corrSharedSMSums_kernel<true><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+    else {
+        corrSharedSMSums_kernel<false><<<blocks, threads, sharedMem>>>(
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            (float) xOffset, (float) yOffset,
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            (float) xUL, (float) yUL, w, h );
+    }
+Error:
+    return;
+}
--- a/corr/corrSharedSums.cuh
+++ b/corr/corrSharedSums.cuh
@ -0,0 +1,161 @@
				@@ -0,0 +1,161 @@
+/*
+ *
+ * corrSharedSums.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the image from shared memory and the template 
+ * from constant memory, and reports the sums (I, IT, ISq)
+ * as well as the output coefficients.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+extern __shared__ unsigned char LocalBlock[];
+
+__global__ void 
+corrSharedSums_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int *pI, int *pISq, int *pIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float xOffset, float yOffset,
+    float cPixels, float fDenomExp, int SharedPitch,
+    float xUL, float yUL, int w, int h )
+{ 
+    int uTile = blockIdx.x*wTile;
+    int vTile = blockIdx.y*blockDim.y;
+    int v = vTile + threadIdx.y;
+
+    float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
+    pI = (int *) (((char *) pI)+v*CorrPitch);
+    pISq = (int *) (((char *) pISq)+v*CorrPitch);
+    pIT = (int *) (((char *) pIT)+v*CorrPitch);
+
+    for ( int row = threadIdx.y; 
+              row < blockDim.y+hTemplate; 
+              row += blockDim.y ) {
+        int SharedIdx = row * SharedPitch;
+
+        for ( int col = threadIdx.x; 
+                  col < wTile+wTemplate; 
+                  col += blockDim.x ) {
+
+            LocalBlock[SharedIdx+col] = 
+                tex2D( texImage, 
+                      (float) (uTile+col+xUL+xOffset), 
+                      (float) (vTile+row+yUL+yOffset) );
+
+        }
+    }
+
+    __syncthreads();
+
+    for ( int col = threadIdx.x; 
+              col < wTile; 
+              col += blockDim.x ) {
+
+        int SumI = 0;
+        int SumISq = 0;
+        int SumIT = 0;
+        int idx = 0;
+        int SharedIdx = threadIdx.y * SharedPitch + col;
+        for ( int j = 0; j < hTemplate; j++ ) {    
+            for ( int i = 0; i < wTemplate; i++) { 
+                unsigned char I = LocalBlock[SharedIdx+i];
+                unsigned char T = g_Tpix[idx++];
+                SumI += I;
+                SumISq += I*I;
+                SumIT += I*T;
+            }
+            SharedIdx += SharedPitch;
+        }
+        if ( uTile+col < w && v < h ) {
+            pI[uTile+col] = SumI;
+            pISq[uTile+col] = SumISq;
+            pIT[uTile+col] = SumIT;
+            pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, g_fDenomExp );
+        }
+    }
+    __syncthreads();
+}
+
+
+void
+corrSharedSums( 
+    float *dCorr, int CorrPitch,
+    int *dSumI, int *dSumISq, int *dSumIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    int device;
+    cudaDeviceProp props;
+    cudaError_t status;
+
+    CUDART_CHECK( cudaGetDevice( &device ) );
+    CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
+    if ( sharedMem > props.sharedMemPerBlock ) {
+        dim3 tcThreads(32, 16, 1);
+        dim3 tcBlocks;
+        tcBlocks.x = INTCEIL(w,threads.x); 
+        tcBlocks.y = INTCEIL(h,threads.y); 
+        tcBlocks.z = 1;
+        return corrTexConstantSums( 
+            dCorr, CorrPitch,
+            dSumI, dSumISq, dSumIT,
+            wTile,
+            wTemplate, hTemplate,
+            cPixels,
+            fDenomExp,
+            sharedPitch,
+            xOffset, yOffset,
+            xTemplate, yTemplate,
+            xUL, yUL, w, h,
+            tcThreads, tcBlocks,
+            sharedMem );
+    }
+    corrSharedSums_kernel<<<blocks, threads, sharedMem>>>(
+        dCorr, CorrPitch,
+        dSumI, dSumISq, dSumIT,
+        wTile,
+        wTemplate, hTemplate,
+        (float) xOffset, (float) yOffset,
+        cPixels, fDenomExp, 
+        sharedPitch, 
+        (float) xUL, (float) yUL, w, h );
+Error:
+    return;
+}
--- a/corr/corrTexConstant.cuh
+++ b/corr/corrTexConstant.cuh
@ -0,0 +1,94 @@
				@@ -0,0 +1,94 @@
+/*
+ *
+ * corrTexConstant.cuh
+ *
+ * Header file for 2D implementation of normalized correlation
+ * that reads the template from constant memory.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+__global__ void 
+corrTexConstant_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    float cPixels, float fDenomExp,
+    float xUL, float yUL, int w, int h,
+    int xOffset, int yOffset,
+    int wTemplate, int hTemplate )
+{
+    size_t row = blockIdx.y*blockDim.y + threadIdx.y;
+    size_t col = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // adjust pointers to row
+    pCorr = (float *) ((char *) pCorr+row*CorrPitch);
+
+    // No __syncthreads in this kernel, so we can early-out
+    // without worrying about the effects of divergence.
+    if ( col >= w || row >= h )
+        return;
+
+    int SumI = 0;
+    int SumISq = 0;
+    int SumIT = 0;
+    int inx = 0;
+
+    for ( int j = 0; j < hTemplate; j++ ) {
+        for ( int i = 0; i < wTemplate; i++ ) {
+            unsigned char I = tex2D( texImage, 
+                                     (float) col+xUL+xOffset+i, 
+                                     (float) row+yUL+yOffset+j );
+            unsigned char T = g_Tpix[inx++];
+            SumI += I;
+            SumISq += I*I;
+            SumIT += I*T;
+        }
+    }
+    pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
+}
+
+void
+corrTexConstant( 
+    float *dCorr, int CorrPitch,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    corrTexConstant_kernel<<<blocks, threads>>>(
+        dCorr, CorrPitch,
+        cPixels, fDenomExp, 
+        (float) xUL, (float) yUL, w, h,
+        xOffset, yOffset, wTemplate, hTemplate );
+}
--- a/corr/corrTexConstantSums.cuh
+++ b/corr/corrTexConstantSums.cuh
@ -0,0 +1,105 @@
				@@ -0,0 +1,105 @@
+/*
+ *
+ * corrTexConstant.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the template from constant memory, and reports
+ * the sums (I, IT, ISq) as well as the output coefficients.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+__global__ void 
+corrTexConstantSums_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int *pI, int *pISq, int *pIT,
+    float cPixels, float fDenomExp,
+    float xUL, float yUL, int w, int h,
+    int xOffset, int yOffset, 
+    int wTemplate, int hTemplate )
+{
+    size_t row = blockIdx.y*blockDim.y + threadIdx.y;
+    size_t col = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // adjust pointers to row
+    pCorr = (float *) ((char *) pCorr+row*CorrPitch);
+    pI    = (int *) ((char *) pI  +row*CorrPitch);
+    pISq  = (int *) ((char *) pISq+row*CorrPitch);
+    pIT   = (int *) ((char *) pIT +row*CorrPitch);
+
+    // No __syncthreads in this kernel, so we can early-out
+    // without worrying about the effects of divergence.
+    if ( col >= w || row >= h )
+        return;
+
+    int SumI = 0;
+    int SumISq = 0;
+    int SumIT = 0;
+
+    int inx = 0;
+
+    for ( int j = 0; j < hTemplate; j++ ) {
+        for ( int i = 0; i < wTemplate; i++ ) {
+            unsigned char I = tex2D( texImage, 
+                                     (float) col+xUL+xOffset+i, 
+                                     (float) row+yUL+yOffset+j );
+            unsigned char T = g_Tpix[inx++];
+            SumI += I;
+            SumISq += I*I;
+            SumIT += I*T;
+        }
+    }
+    pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
+    pI[col] = SumI;
+    pISq[col] = SumISq;
+    pIT[col] = SumIT;
+}
+
+void
+corrTexConstantSums( 
+    float *dCorr, int CorrPitch,
+    int *dSumI, int *dSumISq, int *dSumIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    corrTexConstantSums_kernel<<<blocks, threads>>>(
+        dCorr, CorrPitch,
+        dSumI, dSumISq, dSumIT,
+        cPixels, fDenomExp, 
+        (float) xUL, (float) yUL, w, h,
+        xOffset, yOffset, wTemplate, hTemplate );
+}
--- a/corr/corrTexTex.cuh
+++ b/corr/corrTexTex.cuh
@ -0,0 +1,101 @@
				@@ -0,0 +1,101 @@
+/*
+ *
+ * corrTexTex.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads both image and template from texture.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+__global__ void 
+corrTexTex_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    float cPixels,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int wTemplate, int hTemplate,
+    float xUL, float yUL, int w, int h )
+{
+    size_t row = blockIdx.y*blockDim.y + threadIdx.y;
+    size_t col = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // adjust pCorr to point to row
+    pCorr = (float *) ((char *) pCorr+row*CorrPitch);
+
+    // No __syncthreads in this kernel, so we can early-out
+    // without worrying about the effects of divergence.
+    if ( col >= w || row >= h )
+        return;
+
+    int SumI = 0;
+    int SumT = 0;
+    int SumISq = 0;
+    int SumTSq = 0;
+    int SumIT = 0;
+    for ( int y = 0; y < hTemplate; y++ ) {
+        for ( int x = 0; x < wTemplate; x++ ) {
+            unsigned char I = tex2D( texImage, 
+                (float) col+xUL+xOffset+x, (float) row+yUL+yOffset+y );
+            unsigned char T = tex2D( texTemplate, 
+                (float) xTemplate+x, (float) yTemplate+y);
+            SumI += I;
+            SumT += T;
+            SumISq += I*I;
+            SumTSq += T*T;
+            SumIT += I*T;
+        }
+        float fDenomExp = (float) 
+            ( (double) cPixels*SumTSq - (double) SumT*SumT);
+        pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, SumT, cPixels, fDenomExp );
+    }
+}
+
+void
+corrTexTex( 
+    float *dCorr, int CorrPitch,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    corrTexTex_kernel<<<blocks, threads>>>( 
+        dCorr, CorrPitch,
+        cPixels,
+        xOffset, yOffset,
+        xTemplate+xOffset, yTemplate+yOffset,
+        wTemplate, hTemplate,
+        (float) xUL, (float) yUL, w, h );
+}
--- a/corr/corrTexTexSums.cuh
+++ b/corr/corrTexTexSums.cuh
@ -0,0 +1,115 @@
				@@ -0,0 +1,115 @@
+/*
+ *
+ * corrTexTexSums.cuh
+ *
+ * Header file for implementation of normalized correlation
+ * that reads the template from texture, and reports the
+ * sums (I, IT, ISq) in addition to the coefficients.
+ *
+ * Copyright (c) 2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+__global__ void 
+corrTexTexSums_kernel( 
+    float *pCorr, size_t CorrPitch, 
+    int *pI, int *pISq, int *pIT,
+    float cPixels,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate, 
+    int wTemplate, int hTemplate,
+    float xUL, float yUL, int w, int h )
+{
+    size_t row = blockIdx.y*blockDim.y + threadIdx.y;
+    size_t col = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // adjust pointers to row
+    pCorr = (float *) ((char *) pCorr+row*CorrPitch);
+    pI    = (int *) ((char *) pI  +row*CorrPitch);
+    pISq  = (int *) ((char *) pISq+row*CorrPitch);
+    pIT   = (int *) ((char *) pIT +row*CorrPitch);
+
+    // No __syncthreads in this kernel, so we can early-out
+    // without worrying about the effects of divergence.
+    if ( col >= w || row >= h )
+        return;
+
+    int SumI = 0;
+    int SumT = 0;
+    int SumISq = 0;
+    int SumTSq = 0;
+    int SumIT = 0;
+    for ( int y = 0; y < hTemplate; y++ ) {
+        for ( int x = 0; x < wTemplate; x++ ) {
+            unsigned char I = 
+                tex2D( texImage, 
+                      (float) col+xUL+xOffset+x, 
+                      (float) row+yUL+yOffset+y );
+            unsigned char T = 
+                tex2D( texTemplate, 
+                      (float) xTemplate+x, 
+                      (float) yTemplate+y);
+            SumI += I;
+            SumT += T;
+            SumISq += I*I;
+            SumTSq += T*T;
+            SumIT += I*T;
+        }
+        float fDenomExp = (float) 
+            ((double) cPixels*SumTSq - (double) SumT*SumT);
+        pI[col] = SumI;
+        pISq[col] = SumISq;
+        pIT[col] = SumIT;
+        pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, SumT, cPixels, fDenomExp );
+    }
+}
+
+void
+corrTexTexSums( 
+    float *dCorr, int CorrPitch,
+    int *pI, int *pISq, int *pIT,
+    int wTile,
+    int wTemplate, int hTemplate,
+    float cPixels,
+    float fDenomExp,
+    int sharedPitch,
+    int xOffset, int yOffset,
+    int xTemplate, int yTemplate,
+    int xUL, int yUL, int w, int h,
+    dim3 threads, dim3 blocks,
+    int sharedMem )
+{
+    corrTexTexSums_kernel<<<blocks, threads>>>( 
+        dCorr, CorrPitch,
+        pI, pISq, pIT,
+        cPixels,
+        xOffset, yOffset,
+        xTemplate+xOffset, yTemplate+yOffset, 
+        wTemplate, hTemplate,
+        (float) xUL, (float) yUL, w, h );
+}
--- a/corr/normalizedCrossCorrelation.cu
+++ b/corr/normalizedCrossCorrelation.cu
@ -0,0 +1,641 @@
				@@ -0,0 +1,641 @@
+/*
+ *
+ * normalizedCrossCorrelation.cu
+ *
+ * Microbenchmark for normalized cross correlation, a template-
+ * matching algorithm for computer vision.
+ *
+ * Build with: nvcc -I ../chLib <options> normalizedCrossCorrelation.cu pgm.cu
+ *
+ * Make sure to include pgm.cu for the image file I/O support.
+ *
+ * To avoid warnings about double precision support, specify the
+ * target gpu-architecture, e.g.:
+ * nvcc --gpu-architecture sm_13 -I ../chLib <options> normalizedCrossCorrelation.cu pgm.cu
+ *
+ * Requires: No minimum SM requirement.
+ *
+ * Copyright (c) 2011-2012, Archaea Software, LLC.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+ *
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <chError.h>
+#include <chCommandLine.h>
+#include <chAssert.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+
+#include "pgm.h"
+
+texture<unsigned char, 2> texImage;
+texture<unsigned char, 2> texTemplate;
+
+const int maxTemplatePixels = 3072;
+__constant__ int g_xOffset[maxTemplatePixels];
+__constant__ int g_yOffset[maxTemplatePixels];
+__constant__ unsigned char g_Tpix[maxTemplatePixels];
+__constant__ float g_cPixels, g_SumT, g_fDenomExp;
+unsigned int g_cpuSumT, g_cpuSumTSq;
+
+const float fThreshold = 1e-3f;
+
+#define INTCEIL(a,b) ( ((a)+(b)-1) / (b) )
+
+__device__ __host__ inline float
+CorrelationValue( float SumI, float SumISq, float SumIT, float SumT, float cPixels, float fDenomExp )
+{
+    float Numerator = cPixels*SumIT - SumI*SumT;
+    float Denominator = rsqrtf( (cPixels*SumISq - SumI*SumI)*fDenomExp );
+    return Numerator * Denominator;
+}
+
+#include "corrTexTexSums.cuh"
+#include "corrTexTex.cuh"
+
+#include "corrTexConstantSums.cuh"
+#include "corrTexConstant.cuh"
+
+extern __shared__ unsigned char LocalBlock[];
+
+#include "corrSharedSMSums.cuh"
+#include "corrSharedSM.cuh"
+
+#include "corrSharedSums.cuh"
+#include "corrShared.cuh"
+
+#include "corrShared4Sums.cuh"
+#include "corrShared4.cuh"
+
+int poffsetx[maxTemplatePixels];
+int poffsety[maxTemplatePixels];
+
+cudaError_t
+CopyToTemplate( 
+      unsigned char *img, size_t imgPitch, 
+      int xTemplate, int yTemplate,
+      int wTemplate, int hTemplate,
+      int OffsetX, int OffsetY
+)
+{
+    cudaError_t status;
+    unsigned char pixels[maxTemplatePixels];
+
+    int inx = 0;
+    int SumT = 0;
+    int SumTSq = 0;
+    int cPixels = wTemplate*hTemplate;
+    size_t sizeOffsets = cPixels*sizeof(int);
+    float fSumT, fDenomExp, fcPixels;
+
+    CUDART_CHECK( 
+        cudaMemcpy2D( 
+            pixels, wTemplate,
+            img+yTemplate*imgPitch+xTemplate, imgPitch,
+            wTemplate, hTemplate,
+            cudaMemcpyDeviceToHost ) );
+
+    CUDART_CHECK( cudaMemcpyToSymbol( g_Tpix, pixels, cPixels ) );
+
+    for ( int i = OffsetY; i < OffsetY+hTemplate; i++ ) {
+        for ( int j = OffsetX; j < OffsetX+wTemplate; j++) {
+            SumT += pixels[inx];
+            SumTSq += pixels[inx]*pixels[inx];
+            poffsetx[inx] = j;
+            poffsety[inx] = i;
+            inx += 1;
+        }
+    }
+    g_cpuSumT = SumT;
+    g_cpuSumTSq = SumTSq;
+
+    CUDART_CHECK( cudaMemcpyToSymbol(g_xOffset, poffsetx, sizeOffsets) );
+    CUDART_CHECK( cudaMemcpyToSymbol(g_yOffset, poffsety, sizeOffsets) );
+
+    fSumT = (float) SumT;
+    CUDART_CHECK( cudaMemcpyToSymbol(g_SumT, &fSumT, sizeof(float)) );
+
+    fDenomExp = (float) ( (double)cPixels*SumTSq - (double) SumT*SumT);
+    CUDART_CHECK( cudaMemcpyToSymbol(g_fDenomExp, &fDenomExp, sizeof(float)) );
+
+    fcPixels = (float) cPixels;
+    CUDART_CHECK( cudaMemcpyToSymbol(g_cPixels, &fcPixels, sizeof(float)) );
+Error:
+    return status;
+}
+
+int
+bCompareCorrValues( const float *pBase0, 
+                    const float *pBase1, 
+                    int w, int h )
+{
+    for ( int j = 0; j < h; j++ ) {
+
+        float *pf0 = (float *) ((char *) pBase0+j*w*sizeof(float));
+        float *pf1 = (float *) ((char *) pBase1+j*w*sizeof(float));
+
+        for ( int i = 0; i < w; i++ ) {
+            if ( fabsf(pf0[i]-pf1[i]) > fThreshold ) { 
+                printf( "Mismatch pf0[%d] = %.5f, pf1[%d] = %.5f\n", i, pf0[i], i, pf1[i] ); 
+                fflush( stdout );
+                //CH_ASSERT(0);
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+int
+bCompareSums( const int *pBaseI0, const int *pBaseISq0, const int *pBaseIT0,
+              const int *pBaseI1, const int *pBaseISq1, const int *pBaseIT1,
+              int w, int h )
+{
+    for ( int j = 0; j < h; j++ ) {
+
+        const int *pi0 = (const int *) ((char *) pBaseI0+j*w*sizeof(int));
+        const int *pi1 = (const int *) ((char *) pBaseI1+j*w*sizeof(int));
+
+        const int *pisq0 = (const int *) ((char *) pBaseISq0+j*w*sizeof(int));
+        const int *pisq1 = (const int *) ((char *) pBaseISq1+j*w*sizeof(int));
+
+        const int *pit0 = (const int *) ((char *) pBaseIT0+j*w*sizeof(int));
+        const int *pit1 = (const int *) ((char *) pBaseIT1+j*w*sizeof(int));
+        for ( int i = 0; i < w; i++ ) {
+            if ( pi0[i] != pi1[i] ||
+                 pisq0[i] != pisq1[i] ||
+                 pit0[i] != pit1[i] ) { 
+                printf( "Mismatch pi[%d] = %d, reference = %d\n", i, pi0[i], pi1[i] ); 
+                printf( "Mismatch pisq[%d] = %d, reference = %d\n", i, pisq0[i], pisq1[i] );
+                printf( "Mismatch pit[%d] = %d, reference = %d\n", i, pit0[i], pit1[i] );
+                fflush( stdout );
+                //CH_ASSERT(0);
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+unsigned char
+ReadPixel( unsigned char *base, int pitch, int w, int h, int x, int y )
+{
+    if ( x < 0 ) x = 0;
+    if ( x >= w ) x = w-1;
+    if ( y < 0 ) y = 0;
+    if ( y >= h ) y = h-1;
+    return base[y*pitch+x];
+}
+
+void 
+corrCPU( float *pCorr, 
+         int *_pI, int *_pISq, int *_pIT,
+         size_t CorrPitch, 
+         int cPixels,
+         int xTemplate, int yTemplate,
+         int w, int h,
+         unsigned char *img, int imgPitch,
+         unsigned char *tmp, int tmpPitch )
+{
+    for ( int row = 0; row < h; row += 1 ) {
+        float *pOut = (float *) (((char *) pCorr)+row*CorrPitch);
+        int *pI = (int *) (((char *) _pI)+row*CorrPitch);
+        int *pISq = (int *) (((char *) _pISq)+row*CorrPitch);
+        int *pIT = (int *) (((char *) _pIT)+row*CorrPitch);
+        for ( int col = 0; col < w; col += 1 ) {
+            int SumI = 0;
+            int SumT = 0;
+            int SumISq = 0;
+            int SumTSq = 0;
+            int SumIT = 0;
+            for ( int j = 0; j < cPixels; j++ ) {
+                unsigned char I = ReadPixel( img, imgPitch, w, h, col+poffsetx[j], row+poffsety[j] );
+                unsigned char T = ReadPixel( tmp, tmpPitch, w, h, xTemplate+poffsetx[j], yTemplate+poffsety[j] );
+                SumI += I;
+                SumT += T;
+                SumISq += I*I;
+                SumTSq += T*T;
+                SumIT += I*T;
+            }
+            float fDenomExp = (float) ((double) cPixels*SumTSq - (double) SumT*SumT);
+            pI[col] = SumI;
+            pISq[col] = SumISq;
+            pIT[col] = SumIT;
+            pOut[col] = CorrelationValue( (float) SumI, (float) SumISq, (float) SumIT, (float) SumT, (float) cPixels, fDenomExp );
+        }
+    }
+}
+
+bool
+TestCorrelation( 
+    double *pixelsPerSecond,         // passbacks to report performance
+    double *templatePixelsPerSecond, // 
+    int xOffset, int yOffset,  // offset into image
+    int w, int h,              // width and height of output
+    const float *hrefCorr,     // host reference data
+    const int *hrefSumI,
+    const int *hrefSumISq, 
+    const int *hrefSumIT,
+    int xTemplate, int yTemplate, // reference point in template image
+    int wTemplate, int hTemplate,
+    int wTile,                 // width of image tile
+    int sharedPitch, int sharedMem,
+    dim3 threads, dim3 blocks,
+    void (*pfnCorrelationSums)( 
+        float *dCorr, int CorrPitch,
+        int *dSumI, int *dSumISq, int *dSumIT,
+        int wTile,
+        int wTemplate, int hTemplate,
+        float cPixels,
+        float fDenomExp,
+        int sharedPitch,
+        int xOffset, int yOffset,
+        int xTemplate, int yTemplate,
+        int xUL, int yUL, int w, int h,
+        dim3 threads, dim3 blocks,
+        int sharedMem ),
+    void (*pfnCorrelation)( 
+        float *dCorr, int CorrPitch,
+        int wTile,
+        int wTemplate, int hTemplate,
+        float cPixels,
+        float fDenomExp,
+        int sharedPitch,
+        int xOffset, int yOffset,
+        int xTemplate, int yTemplate,
+        int xUL, int yUL, int w, int h,
+        dim3 threads, dim3 blocks,
+        int sharedMem ),
+    bool bPrintNeighborhood = false,
+    int cIterations = 1,
+    const char *outputFilename = NULL
+)
+{
+    cudaError_t status;
+    bool ret = false;
+    size_t CorrPitch;
+
+    float cPixels = (float) wTemplate*hTemplate;
+    float fDenomExp = (float) ((double) cPixels*g_cpuSumTSq - (double) g_cpuSumT*g_cpuSumT);
+
+    float *hCorr = NULL, *dCorr = NULL;
+    int *hSumI = NULL, *dSumI = NULL;
+    int *hSumISq = NULL, *dSumISq = NULL;
+    int *hSumIT = NULL, *dSumIT = NULL;
+
+    cudaEvent_t start = 0, stop = 0;
+
+    hCorr = (float *) malloc( w*sizeof(float)*h );
+    hSumI = (int *) malloc( w*sizeof(int)*h );
+    hSumISq = (int *) malloc( w*sizeof(int)*h );
+    hSumIT = (int *) malloc( w*sizeof(int)*h );
+    if ( NULL == hCorr || NULL == hSumI || NULL == hSumISq || NULL == hSumIT )
+        goto Error;
+
+    CUDART_CHECK( cudaMallocPitch( (void **) &dCorr, &CorrPitch, w*sizeof(float), h ) );
+    CUDART_CHECK( cudaMallocPitch( (void **) &dSumI, &CorrPitch, w*sizeof(int), h ) );
+    CUDART_CHECK( cudaMallocPitch( (void **) &dSumISq, &CorrPitch, w*sizeof(int), h ) );
+    CUDART_CHECK( cudaMallocPitch( (void **) &dSumIT, &CorrPitch, w*sizeof(int), h ) );
+
+    CUDART_CHECK( cudaMemset( dCorr, 0, CorrPitch*h ) );
+    CUDART_CHECK( cudaMemset( dSumI, 0, CorrPitch*h ) );
+    CUDART_CHECK( cudaMemset( dSumISq, 0, CorrPitch*h ) );
+    CUDART_CHECK( cudaMemset( dSumIT, 0, CorrPitch*h ) );
+
+    CUDART_CHECK( cudaEventCreate( &start, 0 ) );
+    CUDART_CHECK( cudaEventCreate( &stop, 0 ) );
+
+    pfnCorrelationSums(
+        dCorr, CorrPitch,
+        dSumI, dSumISq, dSumIT,
+        wTile,
+        wTemplate, hTemplate, 
+        cPixels, fDenomExp, 
+        sharedPitch, 
+        xOffset, yOffset, 
+        xTemplate, yTemplate, 
+        0, 0, w, h,
+        threads, blocks, sharedMem );
+
+    CUDART_CHECK( cudaMemcpy2D( hSumI, w*sizeof(int), dSumI, CorrPitch, w*sizeof(int), h, cudaMemcpyDeviceToHost ) );
+    CUDART_CHECK( cudaMemcpy2D( hSumISq, w*sizeof(int), dSumISq, CorrPitch, w*sizeof(int), h, cudaMemcpyDeviceToHost ) );
+    CUDART_CHECK( cudaMemcpy2D( hSumIT, w*sizeof(int), dSumIT, CorrPitch, w*sizeof(int), h, cudaMemcpyDeviceToHost ) );
+
+    if ( bCompareSums( hSumI, hSumISq, hSumIT,
+                       hrefSumI, hrefSumISq, hrefSumIT,
+                       w, h ) ) {
+        //CH_ASSERT(0);
+        printf( "Sums miscompare\n" );
+        goto Error;
+    }
+
+    CUDART_CHECK( cudaMemcpy2D( hCorr, w*sizeof(float), dCorr, CorrPitch, w*sizeof(float), h, cudaMemcpyDeviceToHost ) );
+
+    if ( bCompareCorrValues( hrefCorr, hCorr, w, h ) ) {
+        //CH_ASSERT(0);
+        printf( "Correlation coefficients generated by sums kernel mismatch\n" );
+        return 1;
+    }
+
+    CUDART_CHECK( cudaMemset2D( dCorr, CorrPitch, 0, w*sizeof(float), h ) );
+    CUDART_CHECK( cudaDeviceSynchronize() );
+    CUDART_CHECK( cudaEventRecord( start, 0 ) );
+
+    for ( int i = 0; i < cIterations; i++ ) {
+        pfnCorrelation( 
+            dCorr, CorrPitch, 
+            wTile, 
+            wTemplate, hTemplate, 
+            cPixels, fDenomExp, 
+            sharedPitch, 
+            xOffset, yOffset,
+            xTemplate, yTemplate, 
+            0, 0, w, h, 
+            threads, blocks, sharedMem );
+    }
+
+    CUDART_CHECK( cudaEventRecord( stop, 0 ) );
+    CUDART_CHECK( cudaMemcpy2D( hCorr, w*sizeof(float), dCorr, CorrPitch, w*sizeof(float), h, cudaMemcpyDeviceToHost ) );
+
+    if ( bCompareCorrValues( hrefCorr, hCorr, w, h ) ) {
+        CH_ASSERT(0);
+        printf( "Correlation coefficients generated by coefficient-only kernel mismatch\n" );
+        return 1;
+    }
+
+    {
+        float ms;
+        CUDART_CHECK( cudaEventElapsedTime( &ms, start, stop ) );
+        *pixelsPerSecond = (double) w*h*cIterations*1000.0 / ms;
+        *templatePixelsPerSecond = *pixelsPerSecond*wTemplate*hTemplate;
+    }
+
+    if ( bPrintNeighborhood ) {
+        printf( "\nNeighborhood around template:\n" );
+        for ( int VertOffset = -4; VertOffset <= 4; VertOffset++ ) {
+            const float *py = hrefCorr+w*(VertOffset+yTemplate);
+            for ( int HorzOffset = -4; HorzOffset <= 4; HorzOffset++ ) {
+                printf( "%6.2f", py[xTemplate+HorzOffset] );
+            }
+            printf("\n");
+        }
+    }
+
+    if ( outputFilename ) {
+        unsigned char *correlationValues = (unsigned char *) malloc( w*h );
+        if ( ! correlationValues ) {
+            status = cudaErrorMemoryAllocation;
+            goto Error;
+        }
+        for ( int row = 0; row < h; row++ ) {
+            for ( int col = 0; col < w; col++ ) {
+                int index = row*w+col;
+                float value = hCorr[index] < 0.0f ? 0.0f : logf( 1.0f+hCorr[index] )/logf(2.0f);
+                if ( value < 0.5f ) value = 0.0f;
+                value = 2.0f * (value - 0.5f);
+                correlationValues[index] = (unsigned char) (255.0f*value+0.5f);
+            }
+        }
+        if ( 0 != pgmSave( outputFilename, correlationValues, w, h ) ) {
+            status = cudaErrorUnknown;
+            goto Error;
+        }
+        free( correlationValues );
+    }
+
+    ret = true;
+
+Error:
+    cudaEventDestroy( start );
+    cudaEventDestroy( stop );
+    free( hCorr );
+    free( hSumI );
+    free( hSumISq );
+    free( hSumIT );
+    if ( dCorr ) cudaFree( dCorr );
+    if ( dSumI ) cudaFree( dSumI );
+    if ( dSumI ) cudaFree( dSumISq );
+    if ( dSumI ) cudaFree( dSumIT );
+    return ret;
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 1;
+    cudaError_t status;
+
+    unsigned char *hidata = NULL;
+    unsigned char *didata = NULL;
+    float *hoCorrCPU = NULL;
+    
+    int *hoCorrCPUI = NULL;
+    int *hoCorrCPUISq = NULL;
+    int *hoCorrCPUIT = NULL;
+    unsigned int HostPitch, DevicePitch;
+    int w, h;
+
+    int wTemplate = 52;
+    int hTemplate = 52;
+    int xOffset, yOffset;
+
+    int xTemplate = 210;
+    int yTemplate = 148;
+
+    int wTile;
+    dim3 threads;
+    dim3 blocks;
+
+    int sharedPitch;
+    int sharedMem;
+    char *inputFilename = "coins.pgm";
+    char *outputFilename = NULL;
+
+    cudaArray *pArrayImage = NULL;
+    cudaArray *pArrayTemplate = NULL;
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+
+    if ( chCommandLineGetBool( "help", argc, argv ) ) {
+        printf( "Usage:\n" );
+        printf( "    --input <filename>: specify input filename (must be PGM)\n" );
+        printf( "    --output <filename>: Write PGM of correlation values (0..255) to <filename>.\n" );
+        printf( "    --padWidth <value>: pad input image width to specified value\n" );
+        printf( "    --padHeight <value>: pad input image height to specified value\n" );
+        printf( "    --xTemplate <value>: X coordinate of upper left corner of template\n" );
+        printf( "    --yTemplate <value>: Y coordinate of upper left corner of template\n" );
+        printf( "    --wTemplate <value>: Width of template\n" );
+        printf( "    --hTemplate <value>: Height of template\n" );
+        printf( "\nDefault values are coins.pgm, no output file or padding, and template of the dime in the\n" );
+        printf("lower right corner of coins.pgm: xTemplate=210, yTemplate=148, wTemplate=hTemplate=52\n" );
+
+        return 0;
+    }
+
+    CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
+    CUDART_CHECK( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );
+
+    if ( chCommandLineGet( &inputFilename, "input", argc, argv ) ) {
+        printf( "Reading from image file %s\n", inputFilename );
+    }
+    chCommandLineGet( &outputFilename, "output", argc, argv );
+    {
+        int padWidth = 0;
+        int padHeight = 0;
+        if ( chCommandLineGet( &padWidth, "padWidth", argc, argv ) ) {
+            if ( ! chCommandLineGet( &padHeight, "padHeight", argc, argv ) ) {
+                printf( "Must specify both --padWidth and --padHeight\n" );
+                goto Error;
+            }
+        }
+        else {
+            if ( chCommandLineGet( &padHeight, "padHeight", argc, argv ) ) {
+                printf( "Must specify both --padWidth and --padHeight\n" );
+                goto Error;
+            }
+        }
+        if ( pgmLoad(inputFilename, &hidata, &HostPitch, &didata, &DevicePitch, &w, &h, padWidth, padHeight) )
+            goto Error;
+    }
+    chCommandLineGet( &xTemplate, "xTemplate", argc, argv );
+    chCommandLineGet( &yTemplate, "yTemplate", argc, argv );
+    chCommandLineGet( &wTemplate, "wTemplate", argc, argv );
+    chCommandLineGet( &hTemplate, "hTemplate", argc, argv );
+
+    xOffset = -wTemplate/2;
+    yOffset = -wTemplate/2;
+
+    hoCorrCPU = (float *) malloc(w*h*sizeof(float)); if ( ! hoCorrCPU ) return 1;
+    hoCorrCPUI = (int *) malloc(w*h*sizeof(int)); if ( ! hoCorrCPUI ) return 1;
+    hoCorrCPUISq = (int *) malloc(w*h*sizeof(int)); if ( ! hoCorrCPUISq ) return 1;
+    hoCorrCPUIT = (int *) malloc(w*h*sizeof(int)); if ( ! hoCorrCPUIT ) return 1;
+    if ( NULL == hoCorrCPU ||
+         NULL == hoCorrCPUI ||
+         NULL == hoCorrCPUISq ||
+         NULL == hoCorrCPUIT )
+        goto Error;
+
+    CUDART_CHECK( cudaMallocArray( &pArrayImage, &desc, w, h ) );
+    CUDART_CHECK( cudaMallocArray( &pArrayTemplate, &desc, w, h ) );
+    CUDART_CHECK( cudaMemcpyToArray( pArrayImage, 0, 0, hidata, w*h, cudaMemcpyHostToDevice ) );
+        
+    CUDART_CHECK( cudaMemcpy2DArrayToArray( pArrayTemplate, 0, 0, pArrayImage, 0, 0, w, h, cudaMemcpyDeviceToDevice ) );
+    
+    CUDART_CHECK( cudaBindTextureToArray( texImage, pArrayImage ) );
+    CUDART_CHECK( cudaBindTextureToArray( texTemplate, pArrayTemplate ) );
+
+    CopyToTemplate( didata, DevicePitch, 
+                    xTemplate, yTemplate, 
+                    wTemplate, hTemplate,
+                    xOffset, yOffset );
+
+    corrCPU( hoCorrCPU, hoCorrCPUI, hoCorrCPUISq, hoCorrCPUIT, 
+        w*sizeof(float), wTemplate*hTemplate, xTemplate-xOffset, yTemplate-yOffset, w, h, 
+        hidata, HostPitch, hidata, HostPitch );
+
+    // height of thread block must be >= hTemplate
+    wTile = 32;
+    threads = dim3(32,8);
+    blocks = dim3(w/wTile+(0!=w%wTile),h/threads.y+(0!=h%threads.y));
+
+    sharedPitch = ~63&(wTile+wTemplate+63);
+    sharedMem = sharedPitch*(threads.y+hTemplate);
+
+#define TEST_VECTOR( baseName, bPrintNeighborhood, cIterations, outfile ) \
+    { \
+        double pixelsPerSecond; \
+        double templatePixelsPerSecond; \
+        if ( ! TestCorrelation( &pixelsPerSecond, \
+            &templatePixelsPerSecond, \
+            xOffset, yOffset, \
+            w, h,  \
+            hoCorrCPU, \
+            hoCorrCPUI, \
+            hoCorrCPUISq, \
+            hoCorrCPUIT, \
+            xTemplate-xOffset, yTemplate-yOffset, \
+            wTemplate, hTemplate, \
+            wTile, sharedPitch, sharedMem, \
+            threads, blocks,  \
+            baseName##Sums,  \
+            baseName, \
+            bPrintNeighborhood, cIterations, outfile ) ) { \
+            printf( "Error\n" ); \
+        } \
+        printf( "%s: %.2f Mpix/s\t%.2fGtpix/s\n", \
+            #baseName, pixelsPerSecond/1e6, templatePixelsPerSecond/1e9 ); \
+    }
+
+    TEST_VECTOR( corrShared, false, 100, NULL );
+
+    // height of thread block must be >= hTemplate
+    wTile = 32;
+    threads = dim3(32,8);
+    blocks = dim3(w/wTile+(0!=w%wTile),h/threads.y+(0!=h%threads.y));
+
+    sharedPitch = ~63&(((wTile+wTemplate)+63));
+    sharedMem = sharedPitch*(threads.y+hTemplate);
+
+    TEST_VECTOR( corrSharedSM, false, 100, NULL );
+
+    TEST_VECTOR( corrShared4, false, 100, NULL );
+
+    // set up blocking parameters for 2D tex-constant formulation
+    threads.x = 32; threads.y = 16; threads.z = 1;
+    blocks.x = INTCEIL(w,threads.x); blocks.y = INTCEIL(h,threads.y); blocks.z = 1;
+    TEST_VECTOR( corrTexConstant, false, 100, NULL );
+
+    if ( outputFilename ) {
+        printf( "Writing graymap of correlation values to %s\n", outputFilename );
+    }
+
+    // set up blocking parameters for 2D tex-tex formulation
+    threads.x = 16; threads.y = 8; threads.z = 1;
+    blocks.x = INTCEIL(w,threads.x); blocks.y = INTCEIL(h,threads.y); blocks.z = 1;
+    TEST_VECTOR( corrTexTex, false, 100, outputFilename );
+
+    ret = 0;
+Error:
+    free( hoCorrCPU );
+    free( hoCorrCPUI );
+    free( hoCorrCPUISq );
+    free( hoCorrCPUIT );
+
+    free( hidata );
+
+    cudaFree(didata); 
+
+    cudaFreeArray(pArrayImage);
+    cudaFreeArray(pArrayTemplate);
+   
+    return ret;
+
+}
--- a/corr/pgm.cu
+++ b/corr/pgm.cu
@ -0,0 +1,133 @@
				@@ -0,0 +1,133 @@
+/*
+ *
+ * pgm.cu
+ *
+ * Functions to load and store PGM (portable gray map) files.
+ *
+ * Copyright (c) 2011-2012, Archaea Software, LLC.
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+ *
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in 
+ *    the documentation and/or other materials provided with the 
+ *    distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "pgm.h"
+
+int
+pgmLoad(
+    const char *filename, 
+    unsigned char **pHostData, unsigned int *pHostPitch, 
+    unsigned char **pDeviceData, unsigned int *pDevicePitch,
+    int *pWidth, int *pHeight, int padWidth, int padHeight)
+{
+    int ret = 1;
+    const int hsize = 0x40;
+    int w, h;
+    FILE *fp = NULL;
+    int maxval;
+    char header[hsize];
+    unsigned char *idata = NULL;
+    unsigned char *ddata = NULL;
+    size_t dPitch;
+
+    fp = fopen( filename, "rb" );
+    if ( fp == NULL) {
+        fprintf( stderr, "Failed to open %s.\n", filename );
+        goto Error;
+    }
+
+    if (NULL == fgets(header, hsize, fp)) {
+        fprintf(stderr, "Invalid PGM file.\n");
+        goto Error;
+    }
+
+    if ( strncmp(header, "P5", 2) ) {
+        fprintf(stderr, "File is not a PGM image.\n");
+        goto Error;
+    }
+    if ( 1 != fscanf( fp, "%d", &w ) )
+        goto Error;
+    if ( 1 != fscanf( fp, "%d", &h ) )
+        goto Error;
+    if ( 1 != fscanf( fp, "%d", &maxval ) )
+        goto Error;
+    if ( padWidth == 0 && padHeight == 0 ) {
+        padWidth = w;
+        padHeight = h;
+    }
+    idata = (unsigned char *) malloc( padWidth*padHeight );
+    if ( ! idata )
+        goto Error;
+/*    if (((size_t)w*h) != fread(idata, sizeof(unsigned char), w*h, fp))
+        goto Error;*/
+    for ( int row = 0; row < h; row++ ) {
+        if ( (size_t) w != fread( idata+row*padWidth, 1, w, fp ) )
+            goto Error;
+    }
+    if ( cudaSuccess != cudaMallocPitch( (void **) &ddata, &dPitch, padWidth, padHeight ) )
+        goto Error;
+    *pWidth = padWidth;
+    *pHeight = padHeight;
+    *pHostPitch = padWidth;
+    *pHostData = idata;
+    *pDeviceData = ddata;
+    *pDevicePitch = (unsigned int) dPitch;
+    cudaMemcpy2D( ddata, dPitch, idata, padWidth, padWidth, padHeight, cudaMemcpyHostToDevice );
+    fclose(fp);
+    return 0;
+Error:
+    free( idata );
+    cudaFree( ddata );
+    if ( fp ) {
+        fclose( fp );
+    }
+    return ret;
+}
+
+int
+pgmSave(const char* filename, unsigned char *data, int w, int h)
+{
+    int ret = 1;
+    FILE *fp = fopen( filename, "wb" );
+    if ( NULL == fp ) {
+        fprintf( stderr, "Failed to open %s\n", filename );
+        goto Error;
+    }
+
+    fprintf( fp, "P5\n%d\n%d\n%d\n", w, h, 0xff );
+    if ( w*h != fwrite(data, sizeof(unsigned char), w*h, fp) ) {
+        fprintf( stderr, "Write failed\n" );
+        goto Error;
+    }
+
+    fclose(fp);
+    ret = 0;
+Error:
+    return ret;
+}
--- a/corr/pgm.h
+++ b/corr/pgm.h
@ -0,0 +1,10 @@
				@@ -0,0 +1,10 @@
+#ifndef _IMAGE_H_
+#define _IMAGE_H_
+
+int pgmLoad( const char *filename, 
+             unsigned char **pHostData, unsigned int *pSysPitch,
+             unsigned char **pDeviceData, unsigned int *pDevPitch,
+              int *pWidth, int *pHeight, int padWidth=0, int padHeight=0 );
+int pgmSave( const char *filename, unsigned char *data, int w, int h);
+
+#endif
--- a/corr/values.bmp
+++ b/corr/values.bmp