Browse Source

Add normalized correlation code

pull/1/head
Nicholas Wilt 13 years ago
parent
commit
40457387da
  1. BIN
      corr/coins.bmp
  2. 5
      corr/coins.pgm
  3. 150
      corr/corrShared.cuh
  4. 186
      corr/corrShared4.cuh
  5. 217
      corr/corrShared4Sums.cuh
  6. 171
      corr/corrSharedSM.cuh
  7. 183
      corr/corrSharedSMSums.cuh
  8. 161
      corr/corrSharedSums.cuh
  9. 94
      corr/corrTexConstant.cuh
  10. 105
      corr/corrTexConstantSums.cuh
  11. 101
      corr/corrTexTex.cuh
  12. 115
      corr/corrTexTexSums.cuh
  13. 641
      corr/normalizedCrossCorrelation.cu
  14. 133
      corr/pgm.cu
  15. 10
      corr/pgm.h
  16. BIN
      corr/values.bmp

BIN
corr/coins.bmp

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

5
corr/coins.pgm

File diff suppressed because one or more lines are too long

150
corr/corrShared.cuh

@ -0,0 +1,150 @@ @@ -0,0 +1,150 @@
/*
*
* corrShared.cuh
*
* Header file for implementation of normalized correlation
* that reads the image from shared memory and the template
* from constant memory.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
extern __shared__ unsigned char LocalBlock[];
__global__ void
corrShared_kernel(
float *pCorr, size_t CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float xOffset, float yOffset,
float cPixels, float fDenomExp, int SharedPitch,
float xUL, float yUL, int w, int h )
{
int uTile = blockIdx.x*wTile;
int vTile = blockIdx.y*blockDim.y;
int v = vTile + threadIdx.y;
float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
for ( int row = threadIdx.y;
row < blockDim.y+hTemplate;
row += blockDim.y ) {
int SharedIdx = row * SharedPitch;
for ( int col = threadIdx.x;
col < wTile+wTemplate;
col += blockDim.x ) {
LocalBlock[SharedIdx+col] =
tex2D( texImage,
(float) (uTile+col+xUL+xOffset),
(float) (vTile+row+yUL+yOffset) );
}
}
__syncthreads();
for ( int col = threadIdx.x;
col < wTile;
col += blockDim.x ) {
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int idx = 0;
int SharedIdx = threadIdx.y * SharedPitch + col;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate; i++) {
unsigned char I = LocalBlock[SharedIdx+i];
unsigned char T = g_Tpix[idx++];
SumI += I;
SumISq += I*I;
SumIT += I*T;
}
SharedIdx += SharedPitch;
}
if ( uTile+col < w && v < h ) {
pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
}
}
__syncthreads();
}
void
corrShared(
float *dCorr, int CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
int device;
cudaDeviceProp props;
cudaError_t status;
CUDART_CHECK( cudaGetDevice( &device ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
if ( sharedMem > props.sharedMemPerBlock ) {
dim3 tcThreads(32, 16, 1);
dim3 tcBlocks;
tcBlocks.x = INTCEIL(w,threads.x);
tcBlocks.y = INTCEIL(h,threads.y);
tcBlocks.z = 1;
return corrTexConstant(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
cPixels,
fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
xUL, yUL, w, h,
tcThreads, tcBlocks,
sharedMem );
}
corrShared_kernel<<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
Error:
return;
}

186
corr/corrShared4.cuh

@ -0,0 +1,186 @@ @@ -0,0 +1,186 @@
/*
*
* corrShared.cuh
*
* Header file for implementation of normalized correlation
* that reads the image from shared memory and the template
* from constant memory. This implementation includes the
* SM-aware optimizations of corrSharedSMSums.cuh and also
* unrolls the innermost loop 4x.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
extern __shared__ unsigned char LocalBlock[];
template<bool bSM1>
__global__ void
corrShared4_kernel(
float *pCorr, size_t CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float xOffset, float yOffset,
float cPixels, float fDenomExp, int SharedPitch,
float xUL, float yUL, int w, int h )
{
int uTile = blockIdx.x*wTile;
int vTile = blockIdx.y*blockDim.y;
int v = vTile + threadIdx.y;
float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
for ( int row = threadIdx.y;
row < blockDim.y+hTemplate;
row += blockDim.y ) {
int SharedIdx = row * SharedPitch;
for ( int col = threadIdx.x;
col < wTile+wTemplate;
col += blockDim.x ) {
LocalBlock[SharedIdx+col] =
tex2D( texImage,
(float) (uTile+col+xUL+xOffset),
(float) (vTile+row+yUL+yOffset) );
}
}
__syncthreads();
for ( int col = threadIdx.x;
col < wTile;
col += blockDim.x ) {
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int idx = 0;
int SharedIdx = threadIdx.y * SharedPitch + col;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate/4; i++) {
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+0], g_Tpix[idx++] );
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+1], g_Tpix[idx++] );
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+2], g_Tpix[idx++] );
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+3], g_Tpix[idx++] );
}
SharedIdx += SharedPitch;
}
if ( uTile+col < w && v < h ) {
pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
}
}
__syncthreads();
}
void
corrShared4(
float *dCorr, int CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
int device;
cudaDeviceProp props;
cudaError_t status;
CUDART_CHECK( cudaGetDevice( &device ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
if ( sharedMem > props.sharedMemPerBlock ) {
dim3 tcThreads(32, 16, 1);
dim3 tcBlocks;
tcBlocks.x = INTCEIL(w,threads.x);
tcBlocks.y = INTCEIL(h,threads.y);
tcBlocks.z = 1;
return corrTexConstant(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
cPixels,
fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
xUL, yUL, w, h,
tcThreads, tcBlocks,
sharedMem );
}
if ( wTemplate % 4 ) {
if ( props.major == 1 ) {
corrSharedSM_kernel<true><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
else {
corrSharedSM_kernel<false><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
}
if ( props.major == 1 ) {
corrShared4_kernel<true><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
else {
corrShared4_kernel<false><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
Error:
return;
}

217
corr/corrShared4Sums.cuh

@ -0,0 +1,217 @@ @@ -0,0 +1,217 @@
/*
*
* corrShared4Sums.cuh
*
* Header file for implementation of normalized correlation
* that reads the image from shared memory and the template
* from constant memory, and reports the sums (I, IT, ISq)
* as well as the output coefficients. This implementation
* includes the SM-aware optimizations of corrSharedSMSums.cuh
* and also unrolls the innermost loop 4x.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
extern __shared__ unsigned char LocalBlock[];
template<bool bSM1>
__device__ void
corrSharedAccumulate(
int& SumI, int& SumISq, int& SumIT,
unsigned char I, unsigned char T )
{
SumI += I;
if ( bSM1 ) {
SumISq += __umul24(I,I);
SumIT += __umul24(I,T);
}
else {
SumISq += I*I;
SumIT += I*T;
}
}
template<bool bSM1>
__global__ void
corrShared4Sums_kernel(
float *pCorr, size_t CorrPitch,
int *pI, int *pISq, int *pIT,
int wTile,
int wTemplate, int hTemplate,
float xOffset, float yOffset,
float cPixels, float fDenomExp, int SharedPitch,
float xUL, float yUL, int w, int h )
{
int uTile = blockIdx.x*wTile;
int vTile = blockIdx.y*blockDim.y;
int v = vTile + threadIdx.y;
float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
pI = (int *) (((char *) pI)+v*CorrPitch);
pISq = (int *) (((char *) pISq)+v*CorrPitch);
pIT = (int *) (((char *) pIT)+v*CorrPitch);
for ( int row = threadIdx.y;
row < blockDim.y+hTemplate;
row += blockDim.y ) {
int SharedIdx = row * SharedPitch;
for ( int col = threadIdx.x;
col < wTile+wTemplate;
col += blockDim.x ) {
LocalBlock[SharedIdx+col] =
tex2D( texImage,
(float) (uTile+col+xUL+xOffset),
(float) (vTile+row+yUL+yOffset) );
}
}
__syncthreads();
for ( int col = threadIdx.x;
col < wTile;
col += blockDim.x ) {
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int idx = 0;
int SharedIdx = threadIdx.y * SharedPitch + col;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate/4; i++) {
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+0], g_Tpix[idx++] );
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+1], g_Tpix[idx++] );
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+2], g_Tpix[idx++] );
corrSharedAccumulate<bSM1>( SumI, SumISq, SumIT, LocalBlock[SharedIdx+i*4+3], g_Tpix[idx++] );
}
SharedIdx += SharedPitch;
}
if ( uTile+col < w && v < h ) {
pI[uTile+col] = SumI;
pISq[uTile+col] = SumISq;
pIT[uTile+col] = SumIT;
pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, g_fDenomExp );
}
}
__syncthreads();
}
void
corrShared4Sums(
float *dCorr, int CorrPitch,
int *dSumI, int *dSumISq, int *dSumIT,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
int device;
cudaDeviceProp props;
cudaError_t status;
CUDART_CHECK( cudaGetDevice( &device ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
if ( sharedMem > props.sharedMemPerBlock ) {
dim3 tcThreads(32, 16, 1);
dim3 tcBlocks;
tcBlocks.x = INTCEIL(w,threads.x);
tcBlocks.y = INTCEIL(h,threads.y);
tcBlocks.z = 1;
return corrTexConstantSums(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
cPixels,
fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
xUL, yUL, w, h,
tcThreads, tcBlocks,
sharedMem );
}
if ( wTemplate%4 ) {
if ( props.major == 1 ) {
corrSharedSMSums_kernel<true><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
else {
corrSharedSMSums_kernel<false><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
}
if ( props.major == 1 ) {
corrShared4Sums_kernel<true><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
else {
corrShared4Sums_kernel<false><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
Error:
return;
}

171
corr/corrSharedSM.cuh

@ -0,0 +1,171 @@ @@ -0,0 +1,171 @@
/*
*
* corrSharedSM.cuh
*
* Header file for implementation of normalized correlation
* that reads the image from shared memory and the template
* from constant memory. This implementation is SM-aware;
* it launches different kernels depending on whether it's
* running on SM 1.x hardware.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
extern __shared__ unsigned char LocalBlock[];
template<bool bSM1>
__global__ void
corrSharedSM_kernel(
float *pCorr, size_t CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float xOffset, float yOffset,
float cPixels, float fDenomExp, int SharedPitch,
float xUL, float yUL, int w, int h )
{
int uTile = blockIdx.x*wTile;
int vTile = blockIdx.y*blockDim.y;
int v = vTile + threadIdx.y;
float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
for ( int row = threadIdx.y;
row < blockDim.y+hTemplate;
row += blockDim.y ) {
int SharedIdx = row * SharedPitch;
for ( int col = threadIdx.x;
col < wTile+wTemplate;
col += blockDim.x ) {
LocalBlock[SharedIdx+col] =
tex2D( texImage,
(float) (uTile+col+xUL+xOffset),
(float) (vTile+row+yUL+yOffset) );
}
}
__syncthreads();
for ( int col = threadIdx.x;
col < wTile;
col += blockDim.x ) {
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int idx = 0;
int SharedIdx = threadIdx.y * SharedPitch + col;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate; i++) {
unsigned char I = LocalBlock[SharedIdx+i];
unsigned char T = g_Tpix[idx++];
SumI += I;
if ( bSM1 ) {
SumISq += __umul24(I, I);
SumIT += __umul24(I, T);
}
else {
SumISq += I*I;
SumIT += I*T;
}
}
SharedIdx += SharedPitch;
}
if ( uTile+col < w && v < h ) {
pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
}
}
__syncthreads();
}
void
corrSharedSM(
float *dCorr, int CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
int device;
cudaDeviceProp props;
cudaError_t status;
CUDART_CHECK( cudaGetDevice( &device ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
if ( sharedMem > props.sharedMemPerBlock ) {
dim3 tcThreads(32, 16, 1);
dim3 tcBlocks;
tcBlocks.x = INTCEIL(w,threads.x);
tcBlocks.y = INTCEIL(h,threads.y);
tcBlocks.z = 1;
return corrTexConstant(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
cPixels,
fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
xUL, yUL, w, h,
tcThreads, tcBlocks,
sharedMem );
}
if ( props.major == 1 ) {
corrSharedSM_kernel<true><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
else {
corrSharedSM_kernel<false><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
Error:
return;
}

183
corr/corrSharedSMSums.cuh

@ -0,0 +1,183 @@ @@ -0,0 +1,183 @@
/*
*
* corrSharedSums.cuh
*
* Header file for implementation of normalized correlation
* that reads the image from shared memory and the template
* from constant memory, and reports the sums (I, IT, ISq)
* as well as the output coefficients. This implementation
* is SM-aware; it launches different kernels depending on
* whether it's running on SM 1.x hardware.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
extern __shared__ unsigned char LocalBlock[];
template<bool bSM1>
__global__ void
corrSharedSMSums_kernel(
float *pCorr, size_t CorrPitch,
int *pI, int *pISq, int *pIT,
int wTile,
int wTemplate, int hTemplate,
float xOffset, float yOffset,
float cPixels, float fDenomExp, int SharedPitch,
float xUL, float yUL, int w, int h )
{
int uTile = blockIdx.x*wTile;
int vTile = blockIdx.y*blockDim.y;
int v = vTile + threadIdx.y;
float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
pI = (int *) (((char *) pI)+v*CorrPitch);
pISq = (int *) (((char *) pISq)+v*CorrPitch);
pIT = (int *) (((char *) pIT)+v*CorrPitch);
for ( int row = threadIdx.y;
row < blockDim.y+hTemplate;
row += blockDim.y ) {
int SharedIdx = row * SharedPitch;
for ( int col = threadIdx.x;
col < wTile+wTemplate;
col += blockDim.x ) {
LocalBlock[SharedIdx+col] =
tex2D( texImage,
(float) (uTile+col+xUL+xOffset),
(float) (vTile+row+yUL+yOffset) );
}
}
__syncthreads();
for ( int col = threadIdx.x;
col < wTile;
col += blockDim.x ) {
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int idx = 0;
int SharedIdx = threadIdx.y * SharedPitch + col;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate; i++) {
unsigned char I = LocalBlock[SharedIdx+i];
unsigned char T = g_Tpix[idx++];
SumI += I;
if ( bSM1 ) {
SumISq += __umul24( I, I );
SumIT += __umul24( I, T );
}
else {
SumISq += I*I;
SumIT += I*T;
}
}
SharedIdx += SharedPitch;
}
if ( uTile+col < w && v < h ) {
pI[uTile+col] = SumI;
pISq[uTile+col] = SumISq;
pIT[uTile+col] = SumIT;
pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, g_fDenomExp );
}
}
__syncthreads();
}
void
corrSharedSMSums(
float *dCorr, int CorrPitch,
int *dSumI, int *dSumISq, int *dSumIT,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
int device;
cudaDeviceProp props;
cudaError_t status;
CUDART_CHECK( cudaGetDevice( &device ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
if ( sharedMem > props.sharedMemPerBlock ) {
dim3 tcThreads(32, 16, 1);
dim3 tcBlocks;
tcBlocks.x = INTCEIL(w,threads.x);
tcBlocks.y = INTCEIL(h,threads.y);
tcBlocks.z = 1;
return corrTexTexSums(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
cPixels,
fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
xUL, yUL, w, h,
tcThreads, tcBlocks,
sharedMem );
}
if ( props.major == 1 ) {
corrSharedSMSums_kernel<true><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
else {
corrSharedSMSums_kernel<false><<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
}
Error:
return;
}

161
corr/corrSharedSums.cuh

@ -0,0 +1,161 @@ @@ -0,0 +1,161 @@
/*
*
* corrSharedSums.cuh
*
* Header file for implementation of normalized correlation
* that reads the image from shared memory and the template
* from constant memory, and reports the sums (I, IT, ISq)
* as well as the output coefficients.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
extern __shared__ unsigned char LocalBlock[];
__global__ void
corrSharedSums_kernel(
float *pCorr, size_t CorrPitch,
int *pI, int *pISq, int *pIT,
int wTile,
int wTemplate, int hTemplate,
float xOffset, float yOffset,
float cPixels, float fDenomExp, int SharedPitch,
float xUL, float yUL, int w, int h )
{
int uTile = blockIdx.x*wTile;
int vTile = blockIdx.y*blockDim.y;
int v = vTile + threadIdx.y;
float *pOut = (float *) (((char *) pCorr)+v*CorrPitch);
pI = (int *) (((char *) pI)+v*CorrPitch);
pISq = (int *) (((char *) pISq)+v*CorrPitch);
pIT = (int *) (((char *) pIT)+v*CorrPitch);
for ( int row = threadIdx.y;
row < blockDim.y+hTemplate;
row += blockDim.y ) {
int SharedIdx = row * SharedPitch;
for ( int col = threadIdx.x;
col < wTile+wTemplate;
col += blockDim.x ) {
LocalBlock[SharedIdx+col] =
tex2D( texImage,
(float) (uTile+col+xUL+xOffset),
(float) (vTile+row+yUL+yOffset) );
}
}
__syncthreads();
for ( int col = threadIdx.x;
col < wTile;
col += blockDim.x ) {
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int idx = 0;
int SharedIdx = threadIdx.y * SharedPitch + col;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate; i++) {
unsigned char I = LocalBlock[SharedIdx+i];
unsigned char T = g_Tpix[idx++];
SumI += I;
SumISq += I*I;
SumIT += I*T;
}
SharedIdx += SharedPitch;
}
if ( uTile+col < w && v < h ) {
pI[uTile+col] = SumI;
pISq[uTile+col] = SumISq;
pIT[uTile+col] = SumIT;
pOut[uTile+col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, g_fDenomExp );
}
}
__syncthreads();
}
void
corrSharedSums(
float *dCorr, int CorrPitch,
int *dSumI, int *dSumISq, int *dSumIT,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
int device;
cudaDeviceProp props;
cudaError_t status;
CUDART_CHECK( cudaGetDevice( &device ) );
CUDART_CHECK( cudaGetDeviceProperties( &props, device ) );
if ( sharedMem > props.sharedMemPerBlock ) {
dim3 tcThreads(32, 16, 1);
dim3 tcBlocks;
tcBlocks.x = INTCEIL(w,threads.x);
tcBlocks.y = INTCEIL(h,threads.y);
tcBlocks.z = 1;
return corrTexConstantSums(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
cPixels,
fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
xUL, yUL, w, h,
tcThreads, tcBlocks,
sharedMem );
}
corrSharedSums_kernel<<<blocks, threads, sharedMem>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
(float) xOffset, (float) yOffset,
cPixels, fDenomExp,
sharedPitch,
(float) xUL, (float) yUL, w, h );
Error:
return;
}

94
corr/corrTexConstant.cuh

@ -0,0 +1,94 @@ @@ -0,0 +1,94 @@
/*
*
* corrTexConstant.cuh
*
* Header file for 2D implementation of normalized correlation
* that reads the template from constant memory.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
__global__ void
corrTexConstant_kernel(
float *pCorr, size_t CorrPitch,
float cPixels, float fDenomExp,
float xUL, float yUL, int w, int h,
int xOffset, int yOffset,
int wTemplate, int hTemplate )
{
size_t row = blockIdx.y*blockDim.y + threadIdx.y;
size_t col = blockIdx.x*blockDim.x + threadIdx.x;
// adjust pointers to row
pCorr = (float *) ((char *) pCorr+row*CorrPitch);
// No __syncthreads in this kernel, so we can early-out
// without worrying about the effects of divergence.
if ( col >= w || row >= h )
return;
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int inx = 0;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate; i++ ) {
unsigned char I = tex2D( texImage,
(float) col+xUL+xOffset+i,
(float) row+yUL+yOffset+j );
unsigned char T = g_Tpix[inx++];
SumI += I;
SumISq += I*I;
SumIT += I*T;
}
}
pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
}
void
corrTexConstant(
float *dCorr, int CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
corrTexConstant_kernel<<<blocks, threads>>>(
dCorr, CorrPitch,
cPixels, fDenomExp,
(float) xUL, (float) yUL, w, h,
xOffset, yOffset, wTemplate, hTemplate );
}

105
corr/corrTexConstantSums.cuh

@ -0,0 +1,105 @@ @@ -0,0 +1,105 @@
/*
*
* corrTexConstant.cuh
*
* Header file for implementation of normalized correlation
* that reads the template from constant memory, and reports
* the sums (I, IT, ISq) as well as the output coefficients.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
__global__ void
corrTexConstantSums_kernel(
float *pCorr, size_t CorrPitch,
int *pI, int *pISq, int *pIT,
float cPixels, float fDenomExp,
float xUL, float yUL, int w, int h,
int xOffset, int yOffset,
int wTemplate, int hTemplate )
{
size_t row = blockIdx.y*blockDim.y + threadIdx.y;
size_t col = blockIdx.x*blockDim.x + threadIdx.x;
// adjust pointers to row
pCorr = (float *) ((char *) pCorr+row*CorrPitch);
pI = (int *) ((char *) pI +row*CorrPitch);
pISq = (int *) ((char *) pISq+row*CorrPitch);
pIT = (int *) ((char *) pIT +row*CorrPitch);
// No __syncthreads in this kernel, so we can early-out
// without worrying about the effects of divergence.
if ( col >= w || row >= h )
return;
int SumI = 0;
int SumISq = 0;
int SumIT = 0;
int inx = 0;
for ( int j = 0; j < hTemplate; j++ ) {
for ( int i = 0; i < wTemplate; i++ ) {
unsigned char I = tex2D( texImage,
(float) col+xUL+xOffset+i,
(float) row+yUL+yOffset+j );
unsigned char T = g_Tpix[inx++];
SumI += I;
SumISq += I*I;
SumIT += I*T;
}
}
pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, g_SumT, cPixels, fDenomExp );
pI[col] = SumI;
pISq[col] = SumISq;
pIT[col] = SumIT;
}
void
corrTexConstantSums(
float *dCorr, int CorrPitch,
int *dSumI, int *dSumISq, int *dSumIT,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
corrTexConstantSums_kernel<<<blocks, threads>>>(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
cPixels, fDenomExp,
(float) xUL, (float) yUL, w, h,
xOffset, yOffset, wTemplate, hTemplate );
}

101
corr/corrTexTex.cuh

@ -0,0 +1,101 @@ @@ -0,0 +1,101 @@
/*
*
* corrTexTex.cuh
*
* Header file for implementation of normalized correlation
* that reads both image and template from texture.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
__global__ void
corrTexTex_kernel(
float *pCorr, size_t CorrPitch,
float cPixels,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int wTemplate, int hTemplate,
float xUL, float yUL, int w, int h )
{
size_t row = blockIdx.y*blockDim.y + threadIdx.y;
size_t col = blockIdx.x*blockDim.x + threadIdx.x;
// adjust pCorr to point to row
pCorr = (float *) ((char *) pCorr+row*CorrPitch);
// No __syncthreads in this kernel, so we can early-out
// without worrying about the effects of divergence.
if ( col >= w || row >= h )
return;
int SumI = 0;
int SumT = 0;
int SumISq = 0;
int SumTSq = 0;
int SumIT = 0;
for ( int y = 0; y < hTemplate; y++ ) {
for ( int x = 0; x < wTemplate; x++ ) {
unsigned char I = tex2D( texImage,
(float) col+xUL+xOffset+x, (float) row+yUL+yOffset+y );
unsigned char T = tex2D( texTemplate,
(float) xTemplate+x, (float) yTemplate+y);
SumI += I;
SumT += T;
SumISq += I*I;
SumTSq += T*T;
SumIT += I*T;
}
float fDenomExp = (float)
( (double) cPixels*SumTSq - (double) SumT*SumT);
pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, SumT, cPixels, fDenomExp );
}
}
void
corrTexTex(
float *dCorr, int CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
corrTexTex_kernel<<<blocks, threads>>>(
dCorr, CorrPitch,
cPixels,
xOffset, yOffset,
xTemplate+xOffset, yTemplate+yOffset,
wTemplate, hTemplate,
(float) xUL, (float) yUL, w, h );
}

115
corr/corrTexTexSums.cuh

@ -0,0 +1,115 @@ @@ -0,0 +1,115 @@
/*
*
* corrTexTexSums.cuh
*
* Header file for implementation of normalized correlation
* that reads the template from texture, and reports the
* sums (I, IT, ISq) in addition to the coefficients.
*
* Copyright (c) 2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
__global__ void
corrTexTexSums_kernel(
float *pCorr, size_t CorrPitch,
int *pI, int *pISq, int *pIT,
float cPixels,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int wTemplate, int hTemplate,
float xUL, float yUL, int w, int h )
{
size_t row = blockIdx.y*blockDim.y + threadIdx.y;
size_t col = blockIdx.x*blockDim.x + threadIdx.x;
// adjust pointers to row
pCorr = (float *) ((char *) pCorr+row*CorrPitch);
pI = (int *) ((char *) pI +row*CorrPitch);
pISq = (int *) ((char *) pISq+row*CorrPitch);
pIT = (int *) ((char *) pIT +row*CorrPitch);
// No __syncthreads in this kernel, so we can early-out
// without worrying about the effects of divergence.
if ( col >= w || row >= h )
return;
int SumI = 0;
int SumT = 0;
int SumISq = 0;
int SumTSq = 0;
int SumIT = 0;
for ( int y = 0; y < hTemplate; y++ ) {
for ( int x = 0; x < wTemplate; x++ ) {
unsigned char I =
tex2D( texImage,
(float) col+xUL+xOffset+x,
(float) row+yUL+yOffset+y );
unsigned char T =
tex2D( texTemplate,
(float) xTemplate+x,
(float) yTemplate+y);
SumI += I;
SumT += T;
SumISq += I*I;
SumTSq += T*T;
SumIT += I*T;
}
float fDenomExp = (float)
((double) cPixels*SumTSq - (double) SumT*SumT);
pI[col] = SumI;
pISq[col] = SumISq;
pIT[col] = SumIT;
pCorr[col] = CorrelationValue( SumI, SumISq, SumIT, SumT, cPixels, fDenomExp );
}
}
void
corrTexTexSums(
float *dCorr, int CorrPitch,
int *pI, int *pISq, int *pIT,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem )
{
corrTexTexSums_kernel<<<blocks, threads>>>(
dCorr, CorrPitch,
pI, pISq, pIT,
cPixels,
xOffset, yOffset,
xTemplate+xOffset, yTemplate+yOffset,
wTemplate, hTemplate,
(float) xUL, (float) yUL, w, h );
}

641
corr/normalizedCrossCorrelation.cu

@ -0,0 +1,641 @@ @@ -0,0 +1,641 @@
/*
*
* normalizedCrossCorrelation.cu
*
* Microbenchmark for normalized cross correlation, a template-
* matching algorithm for computer vision.
*
* Build with: nvcc -I ../chLib <options> normalizedCrossCorrelation.cu pgm.cu
*
* Make sure to include pgm.cu for the image file I/O support.
*
* To avoid warnings about double precision support, specify the
* target gpu-architecture, e.g.:
* nvcc --gpu-architecture sm_13 -I ../chLib <options> normalizedCrossCorrelation.cu pgm.cu
*
* Requires: No minimum SM requirement.
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <chError.h>
#include <chCommandLine.h>
#include <chAssert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <assert.h>
#include "pgm.h"
texture<unsigned char, 2> texImage;
texture<unsigned char, 2> texTemplate;
const int maxTemplatePixels = 3072;
__constant__ int g_xOffset[maxTemplatePixels];
__constant__ int g_yOffset[maxTemplatePixels];
__constant__ unsigned char g_Tpix[maxTemplatePixels];
__constant__ float g_cPixels, g_SumT, g_fDenomExp;
unsigned int g_cpuSumT, g_cpuSumTSq;
const float fThreshold = 1e-3f;
#define INTCEIL(a,b) ( ((a)+(b)-1) / (b) )
__device__ __host__ inline float
CorrelationValue( float SumI, float SumISq, float SumIT, float SumT, float cPixels, float fDenomExp )
{
float Numerator = cPixels*SumIT - SumI*SumT;
float Denominator = rsqrtf( (cPixels*SumISq - SumI*SumI)*fDenomExp );
return Numerator * Denominator;
}
#include "corrTexTexSums.cuh"
#include "corrTexTex.cuh"
#include "corrTexConstantSums.cuh"
#include "corrTexConstant.cuh"
extern __shared__ unsigned char LocalBlock[];
#include "corrSharedSMSums.cuh"
#include "corrSharedSM.cuh"
#include "corrSharedSums.cuh"
#include "corrShared.cuh"
#include "corrShared4Sums.cuh"
#include "corrShared4.cuh"
int poffsetx[maxTemplatePixels];
int poffsety[maxTemplatePixels];
cudaError_t
CopyToTemplate(
unsigned char *img, size_t imgPitch,
int xTemplate, int yTemplate,
int wTemplate, int hTemplate,
int OffsetX, int OffsetY
)
{
cudaError_t status;
unsigned char pixels[maxTemplatePixels];
int inx = 0;
int SumT = 0;
int SumTSq = 0;
int cPixels = wTemplate*hTemplate;
size_t sizeOffsets = cPixels*sizeof(int);
float fSumT, fDenomExp, fcPixels;
CUDART_CHECK(
cudaMemcpy2D(
pixels, wTemplate,
img+yTemplate*imgPitch+xTemplate, imgPitch,
wTemplate, hTemplate,
cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaMemcpyToSymbol( g_Tpix, pixels, cPixels ) );
for ( int i = OffsetY; i < OffsetY+hTemplate; i++ ) {
for ( int j = OffsetX; j < OffsetX+wTemplate; j++) {
SumT += pixels[inx];
SumTSq += pixels[inx]*pixels[inx];
poffsetx[inx] = j;
poffsety[inx] = i;
inx += 1;
}
}
g_cpuSumT = SumT;
g_cpuSumTSq = SumTSq;
CUDART_CHECK( cudaMemcpyToSymbol(g_xOffset, poffsetx, sizeOffsets) );
CUDART_CHECK( cudaMemcpyToSymbol(g_yOffset, poffsety, sizeOffsets) );
fSumT = (float) SumT;
CUDART_CHECK( cudaMemcpyToSymbol(g_SumT, &fSumT, sizeof(float)) );
fDenomExp = (float) ( (double)cPixels*SumTSq - (double) SumT*SumT);
CUDART_CHECK( cudaMemcpyToSymbol(g_fDenomExp, &fDenomExp, sizeof(float)) );
fcPixels = (float) cPixels;
CUDART_CHECK( cudaMemcpyToSymbol(g_cPixels, &fcPixels, sizeof(float)) );
Error:
return status;
}
int
bCompareCorrValues( const float *pBase0,
const float *pBase1,
int w, int h )
{
for ( int j = 0; j < h; j++ ) {
float *pf0 = (float *) ((char *) pBase0+j*w*sizeof(float));
float *pf1 = (float *) ((char *) pBase1+j*w*sizeof(float));
for ( int i = 0; i < w; i++ ) {
if ( fabsf(pf0[i]-pf1[i]) > fThreshold ) {
printf( "Mismatch pf0[%d] = %.5f, pf1[%d] = %.5f\n", i, pf0[i], i, pf1[i] );
fflush( stdout );
//CH_ASSERT(0);
return 1;
}
}
}
return 0;
}
int
bCompareSums( const int *pBaseI0, const int *pBaseISq0, const int *pBaseIT0,
const int *pBaseI1, const int *pBaseISq1, const int *pBaseIT1,
int w, int h )
{
for ( int j = 0; j < h; j++ ) {
const int *pi0 = (const int *) ((char *) pBaseI0+j*w*sizeof(int));
const int *pi1 = (const int *) ((char *) pBaseI1+j*w*sizeof(int));
const int *pisq0 = (const int *) ((char *) pBaseISq0+j*w*sizeof(int));
const int *pisq1 = (const int *) ((char *) pBaseISq1+j*w*sizeof(int));
const int *pit0 = (const int *) ((char *) pBaseIT0+j*w*sizeof(int));
const int *pit1 = (const int *) ((char *) pBaseIT1+j*w*sizeof(int));
for ( int i = 0; i < w; i++ ) {
if ( pi0[i] != pi1[i] ||
pisq0[i] != pisq1[i] ||
pit0[i] != pit1[i] ) {
printf( "Mismatch pi[%d] = %d, reference = %d\n", i, pi0[i], pi1[i] );
printf( "Mismatch pisq[%d] = %d, reference = %d\n", i, pisq0[i], pisq1[i] );
printf( "Mismatch pit[%d] = %d, reference = %d\n", i, pit0[i], pit1[i] );
fflush( stdout );
//CH_ASSERT(0);
return 1;
}
}
}
return 0;
}
unsigned char
ReadPixel( unsigned char *base, int pitch, int w, int h, int x, int y )
{
if ( x < 0 ) x = 0;
if ( x >= w ) x = w-1;
if ( y < 0 ) y = 0;
if ( y >= h ) y = h-1;
return base[y*pitch+x];
}
void
corrCPU( float *pCorr,
int *_pI, int *_pISq, int *_pIT,
size_t CorrPitch,
int cPixels,
int xTemplate, int yTemplate,
int w, int h,
unsigned char *img, int imgPitch,
unsigned char *tmp, int tmpPitch )
{
for ( int row = 0; row < h; row += 1 ) {
float *pOut = (float *) (((char *) pCorr)+row*CorrPitch);
int *pI = (int *) (((char *) _pI)+row*CorrPitch);
int *pISq = (int *) (((char *) _pISq)+row*CorrPitch);
int *pIT = (int *) (((char *) _pIT)+row*CorrPitch);
for ( int col = 0; col < w; col += 1 ) {
int SumI = 0;
int SumT = 0;
int SumISq = 0;
int SumTSq = 0;
int SumIT = 0;
for ( int j = 0; j < cPixels; j++ ) {
unsigned char I = ReadPixel( img, imgPitch, w, h, col+poffsetx[j], row+poffsety[j] );
unsigned char T = ReadPixel( tmp, tmpPitch, w, h, xTemplate+poffsetx[j], yTemplate+poffsety[j] );
SumI += I;
SumT += T;
SumISq += I*I;
SumTSq += T*T;
SumIT += I*T;
}
float fDenomExp = (float) ((double) cPixels*SumTSq - (double) SumT*SumT);
pI[col] = SumI;
pISq[col] = SumISq;
pIT[col] = SumIT;
pOut[col] = CorrelationValue( (float) SumI, (float) SumISq, (float) SumIT, (float) SumT, (float) cPixels, fDenomExp );
}
}
}
bool
TestCorrelation(
double *pixelsPerSecond, // passbacks to report performance
double *templatePixelsPerSecond, //
int xOffset, int yOffset, // offset into image
int w, int h, // width and height of output
const float *hrefCorr, // host reference data
const int *hrefSumI,
const int *hrefSumISq,
const int *hrefSumIT,
int xTemplate, int yTemplate, // reference point in template image
int wTemplate, int hTemplate,
int wTile, // width of image tile
int sharedPitch, int sharedMem,
dim3 threads, dim3 blocks,
void (*pfnCorrelationSums)(
float *dCorr, int CorrPitch,
int *dSumI, int *dSumISq, int *dSumIT,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem ),
void (*pfnCorrelation)(
float *dCorr, int CorrPitch,
int wTile,
int wTemplate, int hTemplate,
float cPixels,
float fDenomExp,
int sharedPitch,
int xOffset, int yOffset,
int xTemplate, int yTemplate,
int xUL, int yUL, int w, int h,
dim3 threads, dim3 blocks,
int sharedMem ),
bool bPrintNeighborhood = false,
int cIterations = 1,
const char *outputFilename = NULL
)
{
cudaError_t status;
bool ret = false;
size_t CorrPitch;
float cPixels = (float) wTemplate*hTemplate;
float fDenomExp = (float) ((double) cPixels*g_cpuSumTSq - (double) g_cpuSumT*g_cpuSumT);
float *hCorr = NULL, *dCorr = NULL;
int *hSumI = NULL, *dSumI = NULL;
int *hSumISq = NULL, *dSumISq = NULL;
int *hSumIT = NULL, *dSumIT = NULL;
cudaEvent_t start = 0, stop = 0;
hCorr = (float *) malloc( w*sizeof(float)*h );
hSumI = (int *) malloc( w*sizeof(int)*h );
hSumISq = (int *) malloc( w*sizeof(int)*h );
hSumIT = (int *) malloc( w*sizeof(int)*h );
if ( NULL == hCorr || NULL == hSumI || NULL == hSumISq || NULL == hSumIT )
goto Error;
CUDART_CHECK( cudaMallocPitch( (void **) &dCorr, &CorrPitch, w*sizeof(float), h ) );
CUDART_CHECK( cudaMallocPitch( (void **) &dSumI, &CorrPitch, w*sizeof(int), h ) );
CUDART_CHECK( cudaMallocPitch( (void **) &dSumISq, &CorrPitch, w*sizeof(int), h ) );
CUDART_CHECK( cudaMallocPitch( (void **) &dSumIT, &CorrPitch, w*sizeof(int), h ) );
CUDART_CHECK( cudaMemset( dCorr, 0, CorrPitch*h ) );
CUDART_CHECK( cudaMemset( dSumI, 0, CorrPitch*h ) );
CUDART_CHECK( cudaMemset( dSumISq, 0, CorrPitch*h ) );
CUDART_CHECK( cudaMemset( dSumIT, 0, CorrPitch*h ) );
CUDART_CHECK( cudaEventCreate( &start, 0 ) );
CUDART_CHECK( cudaEventCreate( &stop, 0 ) );
pfnCorrelationSums(
dCorr, CorrPitch,
dSumI, dSumISq, dSumIT,
wTile,
wTemplate, hTemplate,
cPixels, fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
0, 0, w, h,
threads, blocks, sharedMem );
CUDART_CHECK( cudaMemcpy2D( hSumI, w*sizeof(int), dSumI, CorrPitch, w*sizeof(int), h, cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaMemcpy2D( hSumISq, w*sizeof(int), dSumISq, CorrPitch, w*sizeof(int), h, cudaMemcpyDeviceToHost ) );
CUDART_CHECK( cudaMemcpy2D( hSumIT, w*sizeof(int), dSumIT, CorrPitch, w*sizeof(int), h, cudaMemcpyDeviceToHost ) );
if ( bCompareSums( hSumI, hSumISq, hSumIT,
hrefSumI, hrefSumISq, hrefSumIT,
w, h ) ) {
//CH_ASSERT(0);
printf( "Sums miscompare\n" );
goto Error;
}
CUDART_CHECK( cudaMemcpy2D( hCorr, w*sizeof(float), dCorr, CorrPitch, w*sizeof(float), h, cudaMemcpyDeviceToHost ) );
if ( bCompareCorrValues( hrefCorr, hCorr, w, h ) ) {
//CH_ASSERT(0);
printf( "Correlation coefficients generated by sums kernel mismatch\n" );
return 1;
}
CUDART_CHECK( cudaMemset2D( dCorr, CorrPitch, 0, w*sizeof(float), h ) );
CUDART_CHECK( cudaDeviceSynchronize() );
CUDART_CHECK( cudaEventRecord( start, 0 ) );
for ( int i = 0; i < cIterations; i++ ) {
pfnCorrelation(
dCorr, CorrPitch,
wTile,
wTemplate, hTemplate,
cPixels, fDenomExp,
sharedPitch,
xOffset, yOffset,
xTemplate, yTemplate,
0, 0, w, h,
threads, blocks, sharedMem );
}
CUDART_CHECK( cudaEventRecord( stop, 0 ) );
CUDART_CHECK( cudaMemcpy2D( hCorr, w*sizeof(float), dCorr, CorrPitch, w*sizeof(float), h, cudaMemcpyDeviceToHost ) );
if ( bCompareCorrValues( hrefCorr, hCorr, w, h ) ) {
CH_ASSERT(0);
printf( "Correlation coefficients generated by coefficient-only kernel mismatch\n" );
return 1;
}
{
float ms;
CUDART_CHECK( cudaEventElapsedTime( &ms, start, stop ) );
*pixelsPerSecond = (double) w*h*cIterations*1000.0 / ms;
*templatePixelsPerSecond = *pixelsPerSecond*wTemplate*hTemplate;
}
if ( bPrintNeighborhood ) {
printf( "\nNeighborhood around template:\n" );
for ( int VertOffset = -4; VertOffset <= 4; VertOffset++ ) {
const float *py = hrefCorr+w*(VertOffset+yTemplate);
for ( int HorzOffset = -4; HorzOffset <= 4; HorzOffset++ ) {
printf( "%6.2f", py[xTemplate+HorzOffset] );
}
printf("\n");
}
}
if ( outputFilename ) {
unsigned char *correlationValues = (unsigned char *) malloc( w*h );
if ( ! correlationValues ) {
status = cudaErrorMemoryAllocation;
goto Error;
}
for ( int row = 0; row < h; row++ ) {
for ( int col = 0; col < w; col++ ) {
int index = row*w+col;
float value = hCorr[index] < 0.0f ? 0.0f : logf( 1.0f+hCorr[index] )/logf(2.0f);
if ( value < 0.5f ) value = 0.0f;
value = 2.0f * (value - 0.5f);
correlationValues[index] = (unsigned char) (255.0f*value+0.5f);
}
}
if ( 0 != pgmSave( outputFilename, correlationValues, w, h ) ) {
status = cudaErrorUnknown;
goto Error;
}
free( correlationValues );
}
ret = true;
Error:
cudaEventDestroy( start );
cudaEventDestroy( stop );
free( hCorr );
free( hSumI );
free( hSumISq );
free( hSumIT );
if ( dCorr ) cudaFree( dCorr );
if ( dSumI ) cudaFree( dSumI );
if ( dSumI ) cudaFree( dSumISq );
if ( dSumI ) cudaFree( dSumIT );
return ret;
}
int
main(int argc, char *argv[])
{
int ret = 1;
cudaError_t status;
unsigned char *hidata = NULL;
unsigned char *didata = NULL;
float *hoCorrCPU = NULL;
int *hoCorrCPUI = NULL;
int *hoCorrCPUISq = NULL;
int *hoCorrCPUIT = NULL;
unsigned int HostPitch, DevicePitch;
int w, h;
int wTemplate = 52;
int hTemplate = 52;
int xOffset, yOffset;
int xTemplate = 210;
int yTemplate = 148;
int wTile;
dim3 threads;
dim3 blocks;
int sharedPitch;
int sharedMem;
char *inputFilename = "coins.pgm";
char *outputFilename = NULL;
cudaArray *pArrayImage = NULL;
cudaArray *pArrayTemplate = NULL;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
if ( chCommandLineGetBool( "help", argc, argv ) ) {
printf( "Usage:\n" );
printf( " --input <filename>: specify input filename (must be PGM)\n" );
printf( " --output <filename>: Write PGM of correlation values (0..255) to <filename>.\n" );
printf( " --padWidth <value>: pad input image width to specified value\n" );
printf( " --padHeight <value>: pad input image height to specified value\n" );
printf( " --xTemplate <value>: X coordinate of upper left corner of template\n" );
printf( " --yTemplate <value>: Y coordinate of upper left corner of template\n" );
printf( " --wTemplate <value>: Width of template\n" );
printf( " --hTemplate <value>: Height of template\n" );
printf( "\nDefault values are coins.pgm, no output file or padding, and template of the dime in the\n" );
printf("lower right corner of coins.pgm: xTemplate=210, yTemplate=148, wTemplate=hTemplate=52\n" );
return 0;
}
CUDART_CHECK( cudaSetDeviceFlags( cudaDeviceMapHost ) );
CUDART_CHECK( cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ) );
if ( chCommandLineGet( &inputFilename, "input", argc, argv ) ) {
printf( "Reading from image file %s\n", inputFilename );
}
chCommandLineGet( &outputFilename, "output", argc, argv );
{
int padWidth = 0;
int padHeight = 0;
if ( chCommandLineGet( &padWidth, "padWidth", argc, argv ) ) {
if ( ! chCommandLineGet( &padHeight, "padHeight", argc, argv ) ) {
printf( "Must specify both --padWidth and --padHeight\n" );
goto Error;
}
}
else {
if ( chCommandLineGet( &padHeight, "padHeight", argc, argv ) ) {
printf( "Must specify both --padWidth and --padHeight\n" );
goto Error;
}
}
if ( pgmLoad(inputFilename, &hidata, &HostPitch, &didata, &DevicePitch, &w, &h, padWidth, padHeight) )
goto Error;
}
chCommandLineGet( &xTemplate, "xTemplate", argc, argv );
chCommandLineGet( &yTemplate, "yTemplate", argc, argv );
chCommandLineGet( &wTemplate, "wTemplate", argc, argv );
chCommandLineGet( &hTemplate, "hTemplate", argc, argv );
xOffset = -wTemplate/2;
yOffset = -wTemplate/2;
hoCorrCPU = (float *) malloc(w*h*sizeof(float)); if ( ! hoCorrCPU ) return 1;
hoCorrCPUI = (int *) malloc(w*h*sizeof(int)); if ( ! hoCorrCPUI ) return 1;
hoCorrCPUISq = (int *) malloc(w*h*sizeof(int)); if ( ! hoCorrCPUISq ) return 1;
hoCorrCPUIT = (int *) malloc(w*h*sizeof(int)); if ( ! hoCorrCPUIT ) return 1;
if ( NULL == hoCorrCPU ||
NULL == hoCorrCPUI ||
NULL == hoCorrCPUISq ||
NULL == hoCorrCPUIT )
goto Error;
CUDART_CHECK( cudaMallocArray( &pArrayImage, &desc, w, h ) );
CUDART_CHECK( cudaMallocArray( &pArrayTemplate, &desc, w, h ) );
CUDART_CHECK( cudaMemcpyToArray( pArrayImage, 0, 0, hidata, w*h, cudaMemcpyHostToDevice ) );
CUDART_CHECK( cudaMemcpy2DArrayToArray( pArrayTemplate, 0, 0, pArrayImage, 0, 0, w, h, cudaMemcpyDeviceToDevice ) );
CUDART_CHECK( cudaBindTextureToArray( texImage, pArrayImage ) );
CUDART_CHECK( cudaBindTextureToArray( texTemplate, pArrayTemplate ) );
CopyToTemplate( didata, DevicePitch,
xTemplate, yTemplate,
wTemplate, hTemplate,
xOffset, yOffset );
corrCPU( hoCorrCPU, hoCorrCPUI, hoCorrCPUISq, hoCorrCPUIT,
w*sizeof(float), wTemplate*hTemplate, xTemplate-xOffset, yTemplate-yOffset, w, h,
hidata, HostPitch, hidata, HostPitch );
// height of thread block must be >= hTemplate
wTile = 32;
threads = dim3(32,8);
blocks = dim3(w/wTile+(0!=w%wTile),h/threads.y+(0!=h%threads.y));
sharedPitch = ~63&(wTile+wTemplate+63);
sharedMem = sharedPitch*(threads.y+hTemplate);
#define TEST_VECTOR( baseName, bPrintNeighborhood, cIterations, outfile ) \
{ \
double pixelsPerSecond; \
double templatePixelsPerSecond; \
if ( ! TestCorrelation( &pixelsPerSecond, \
&templatePixelsPerSecond, \
xOffset, yOffset, \
w, h, \
hoCorrCPU, \
hoCorrCPUI, \
hoCorrCPUISq, \
hoCorrCPUIT, \
xTemplate-xOffset, yTemplate-yOffset, \
wTemplate, hTemplate, \
wTile, sharedPitch, sharedMem, \
threads, blocks, \
baseName##Sums, \
baseName, \
bPrintNeighborhood, cIterations, outfile ) ) { \
printf( "Error\n" ); \
} \
printf( "%s: %.2f Mpix/s\t%.2fGtpix/s\n", \
#baseName, pixelsPerSecond/1e6, templatePixelsPerSecond/1e9 ); \
}
TEST_VECTOR( corrShared, false, 100, NULL );
// height of thread block must be >= hTemplate
wTile = 32;
threads = dim3(32,8);
blocks = dim3(w/wTile+(0!=w%wTile),h/threads.y+(0!=h%threads.y));
sharedPitch = ~63&(((wTile+wTemplate)+63));
sharedMem = sharedPitch*(threads.y+hTemplate);
TEST_VECTOR( corrSharedSM, false, 100, NULL );
TEST_VECTOR( corrShared4, false, 100, NULL );
// set up blocking parameters for 2D tex-constant formulation
threads.x = 32; threads.y = 16; threads.z = 1;
blocks.x = INTCEIL(w,threads.x); blocks.y = INTCEIL(h,threads.y); blocks.z = 1;
TEST_VECTOR( corrTexConstant, false, 100, NULL );
if ( outputFilename ) {
printf( "Writing graymap of correlation values to %s\n", outputFilename );
}
// set up blocking parameters for 2D tex-tex formulation
threads.x = 16; threads.y = 8; threads.z = 1;
blocks.x = INTCEIL(w,threads.x); blocks.y = INTCEIL(h,threads.y); blocks.z = 1;
TEST_VECTOR( corrTexTex, false, 100, outputFilename );
ret = 0;
Error:
free( hoCorrCPU );
free( hoCorrCPUI );
free( hoCorrCPUISq );
free( hoCorrCPUIT );
free( hidata );
cudaFree(didata);
cudaFreeArray(pArrayImage);
cudaFreeArray(pArrayTemplate);
return ret;
}

133
corr/pgm.cu

@ -0,0 +1,133 @@ @@ -0,0 +1,133 @@
/*
*
* pgm.cu
*
* Functions to load and store PGM (portable gray map) files.
*
* Copyright (c) 2011-2012, Archaea Software, LLC.
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "pgm.h"
int
pgmLoad(
const char *filename,
unsigned char **pHostData, unsigned int *pHostPitch,
unsigned char **pDeviceData, unsigned int *pDevicePitch,
int *pWidth, int *pHeight, int padWidth, int padHeight)
{
int ret = 1;
const int hsize = 0x40;
int w, h;
FILE *fp = NULL;
int maxval;
char header[hsize];
unsigned char *idata = NULL;
unsigned char *ddata = NULL;
size_t dPitch;
fp = fopen( filename, "rb" );
if ( fp == NULL) {
fprintf( stderr, "Failed to open %s.\n", filename );
goto Error;
}
if (NULL == fgets(header, hsize, fp)) {
fprintf(stderr, "Invalid PGM file.\n");
goto Error;
}
if ( strncmp(header, "P5", 2) ) {
fprintf(stderr, "File is not a PGM image.\n");
goto Error;
}
if ( 1 != fscanf( fp, "%d", &w ) )
goto Error;
if ( 1 != fscanf( fp, "%d", &h ) )
goto Error;
if ( 1 != fscanf( fp, "%d", &maxval ) )
goto Error;
if ( padWidth == 0 && padHeight == 0 ) {
padWidth = w;
padHeight = h;
}
idata = (unsigned char *) malloc( padWidth*padHeight );
if ( ! idata )
goto Error;
/* if (((size_t)w*h) != fread(idata, sizeof(unsigned char), w*h, fp))
goto Error;*/
for ( int row = 0; row < h; row++ ) {
if ( (size_t) w != fread( idata+row*padWidth, 1, w, fp ) )
goto Error;
}
if ( cudaSuccess != cudaMallocPitch( (void **) &ddata, &dPitch, padWidth, padHeight ) )
goto Error;
*pWidth = padWidth;
*pHeight = padHeight;
*pHostPitch = padWidth;
*pHostData = idata;
*pDeviceData = ddata;
*pDevicePitch = (unsigned int) dPitch;
cudaMemcpy2D( ddata, dPitch, idata, padWidth, padWidth, padHeight, cudaMemcpyHostToDevice );
fclose(fp);
return 0;
Error:
free( idata );
cudaFree( ddata );
if ( fp ) {
fclose( fp );
}
return ret;
}
int
pgmSave(const char* filename, unsigned char *data, int w, int h)
{
int ret = 1;
FILE *fp = fopen( filename, "wb" );
if ( NULL == fp ) {
fprintf( stderr, "Failed to open %s\n", filename );
goto Error;
}
fprintf( fp, "P5\n%d\n%d\n%d\n", w, h, 0xff );
if ( w*h != fwrite(data, sizeof(unsigned char), w*h, fp) ) {
fprintf( stderr, "Write failed\n" );
goto Error;
}
fclose(fp);
ret = 0;
Error:
return ret;
}

10
corr/pgm.h

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
#ifndef _IMAGE_H_
#define _IMAGE_H_
int pgmLoad( const char *filename,
unsigned char **pHostData, unsigned int *pSysPitch,
unsigned char **pDeviceData, unsigned int *pDevPitch,
int *pWidth, int *pHeight, int padWidth=0, int padHeight=0 );
int pgmSave( const char *filename, unsigned char *data, int w, int h);
#endif

BIN
corr/values.bmp

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

Loading…
Cancel
Save