|
|
@ -0,0 +1,274 @@ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "cuda_runtime.h" |
|
|
|
|
|
|
|
#include "device_launch_parameters.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdio.h> |
|
|
|
|
|
|
|
#include <math.h> |
|
|
|
|
|
|
|
#include <opencv2/opencv.hpp> |
|
|
|
|
|
|
|
#include <chrono> |
|
|
|
|
|
|
|
using namespace cv; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define threadX 16 |
|
|
|
|
|
|
|
#define threadY 16 |
|
|
|
|
|
|
|
#define threadZ 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary> |
|
|
|
|
|
|
|
/// 双边滤波的核函数,边界以0填充 |
|
|
|
|
|
|
|
/// </summary> |
|
|
|
|
|
|
|
/// <param name="src"></param> |
|
|
|
|
|
|
|
/// <param name="dst"></param> |
|
|
|
|
|
|
|
/// <param name="width"></param> |
|
|
|
|
|
|
|
/// <param name="height"></param> |
|
|
|
|
|
|
|
/// <param name="r"></param> |
|
|
|
|
|
|
|
/// <param name="sigmaC"></param> |
|
|
|
|
|
|
|
/// <param name="sigmaS"></param> |
|
|
|
|
|
|
|
/// <returns></returns> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__ void bilakernel(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaS) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int x = threadIdx.x + blockDim.x * blockIdx.x; |
|
|
|
|
|
|
|
int y = threadIdx.y + blockDim.y * blockIdx.y; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (x >= width || y >= height)return; |
|
|
|
|
|
|
|
double mask = 0; |
|
|
|
|
|
|
|
double3 result; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// mask.x = 0; |
|
|
|
|
|
|
|
// mask.y = 0; |
|
|
|
|
|
|
|
// mask.z = 0; |
|
|
|
|
|
|
|
result.x = 0; |
|
|
|
|
|
|
|
result.y = 0; |
|
|
|
|
|
|
|
result.z = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uchar3 origin = src[x + y * width]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = -r; i <= r; i++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for (int j = -r; j <= r; j++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
uchar3 tmp = src[x + i + (y + j) * width]; |
|
|
|
|
|
|
|
int3 sub; |
|
|
|
|
|
|
|
sub.x = tmp.x - origin.x; |
|
|
|
|
|
|
|
sub.y = tmp.y - origin.y; |
|
|
|
|
|
|
|
sub.z = tmp.z - origin.z; |
|
|
|
|
|
|
|
double masktmp = (exp(-0.5 / sigmaC / sigmaC * sub.x * sub.x) + |
|
|
|
|
|
|
|
exp(-0.5 / sigmaC / sigmaC * sub.y * sub.y) + |
|
|
|
|
|
|
|
exp(-0.5 / sigmaC / sigmaC * sub.z * sub.z)) * |
|
|
|
|
|
|
|
exp(-0.5 / sigmaS / sigmaS * (i * i + j * j)); |
|
|
|
|
|
|
|
mask += masktmp; |
|
|
|
|
|
|
|
result.x += tmp.x * masktmp; |
|
|
|
|
|
|
|
result.y += tmp.y * masktmp; |
|
|
|
|
|
|
|
result.z += tmp.z * masktmp; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (mask > 0) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
dst[x + y * width].x = result.x / mask; |
|
|
|
|
|
|
|
dst[x + y * width].y = result.y / mask; |
|
|
|
|
|
|
|
dst[x + y * width].z = result.z / mask; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
dst[x + y * width].x = 0; |
|
|
|
|
|
|
|
dst[x + y * width].y = 0; |
|
|
|
|
|
|
|
dst[x + y * width].z = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary> |
|
|
|
|
|
|
|
/// 双边滤波的核函数,通过共享内存加速 |
|
|
|
|
|
|
|
/// </summary> |
|
|
|
|
|
|
|
/// <param name="src"></param> |
|
|
|
|
|
|
|
/// <param name="dst"></param> |
|
|
|
|
|
|
|
/// <param name="width"></param> |
|
|
|
|
|
|
|
/// <param name="height"></param> |
|
|
|
|
|
|
|
/// <param name="r"></param> |
|
|
|
|
|
|
|
/// <param name="sigmaC"></param> |
|
|
|
|
|
|
|
/// <param name="sigmaD"></param> |
|
|
|
|
|
|
|
/// <returns></returns> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__ void bilakernel_v2(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaD) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int x = threadIdx.x + blockDim.x * blockIdx.x; |
|
|
|
|
|
|
|
int y = threadIdx.y + blockDim.y * blockIdx.y; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (x >= width || y >= height)return; |
|
|
|
|
|
|
|
const int blocksize = (2 * r + threadX) * (2 * r + threadY); |
|
|
|
|
|
|
|
const int id = x + y * width; |
|
|
|
|
|
|
|
extern __shared__ uchar3 box[];// (2 * threadX)* (2 * threadY)]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int n = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while ((threadIdx.x + threadIdx.y * blockDim.x + n * blockDim.x * blockDim.y) < (2 * r + blockDim.x) * (2 * r + blockDim.y)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int localIndex = threadIdx.x + threadIdx.y * blockDim.x + n * blockDim.x * blockDim.y; |
|
|
|
|
|
|
|
int x_temp = localIndex % (blockDim.x + 2 * r); |
|
|
|
|
|
|
|
int y_temp = localIndex / (blockDim.x + 2 * r); |
|
|
|
|
|
|
|
int x_real = x_temp - r + blockDim.x * blockIdx.x; |
|
|
|
|
|
|
|
int y_real = y_temp - r + blockDim.y * blockIdx.y; |
|
|
|
|
|
|
|
if (x_real < 0 || x_real >= width || y_real < 0 || y_real >= height) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
box[localIndex].x = 0; |
|
|
|
|
|
|
|
box[localIndex].y = 0; |
|
|
|
|
|
|
|
box[localIndex].z = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
box[localIndex] = src[x_real + y_real * width]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
n++; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
double coeff = 0; |
|
|
|
|
|
|
|
double sum[3] = { 0 }; |
|
|
|
|
|
|
|
int idCenter = r + threadIdx.x + (r + threadIdx.y) * (r * 2 + blockDim.y); |
|
|
|
|
|
|
|
for (int i = -r; i <= r; i++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for (int j = -r; j <= r; j++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int idCur = r + i + threadIdx.x + (r + j + threadIdx.y) * (r * 2 + blockDim.y); |
|
|
|
|
|
|
|
double tmp = ( |
|
|
|
|
|
|
|
exp(-0.5 / sigmaC / sigmaC * (box[idCur].x - box[idCenter].x) * (box[idCur].x - box[idCenter].x)) + |
|
|
|
|
|
|
|
exp(-0.5 / sigmaC / sigmaC * (box[idCur].y - box[idCenter].y) * (box[idCur].y - box[idCenter].y)) + |
|
|
|
|
|
|
|
exp(-0.5 / sigmaC / sigmaC * (box[idCur].z - box[idCenter].z) * (box[idCur].z - box[idCenter].z)) |
|
|
|
|
|
|
|
) * |
|
|
|
|
|
|
|
exp(-0.5 / sigmaD / sigmaD * |
|
|
|
|
|
|
|
(i * i + j * j)); |
|
|
|
|
|
|
|
coeff += tmp; |
|
|
|
|
|
|
|
sum[0] += tmp * box[idCur].x; |
|
|
|
|
|
|
|
sum[1] += tmp * box[idCur].y; |
|
|
|
|
|
|
|
sum[2] += tmp * box[idCur].z; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (coeff > 0) { |
|
|
|
|
|
|
|
dst[id].x = (uchar)(sum[0] / coeff); |
|
|
|
|
|
|
|
dst[id].y = (uchar)(sum[1] / coeff); |
|
|
|
|
|
|
|
dst[id].z = (uchar)(sum[2] / coeff); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
dst[id].x = 0; |
|
|
|
|
|
|
|
dst[id].y = 0; |
|
|
|
|
|
|
|
dst[id].z = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
|
|
|
* \brief 模板类 |
|
|
|
|
|
|
|
* \tparam N 共有N个流来加速 |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
template <int N> |
|
|
|
|
|
|
|
class cuFilter |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
public: |
|
|
|
|
|
|
|
cuFilter(dim3 g, dim3 b, int rows, int cols, int channels, int smSize) :grid(g), block(b), sharedMemSize(smSize) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for (int i = 0; i < N; i++) { |
|
|
|
|
|
|
|
cudaStreamCreate(&streams[i]); |
|
|
|
|
|
|
|
cudaMalloc(&srcmem[i], rows * cols * channels); |
|
|
|
|
|
|
|
cudaMalloc(&dstmem[i], rows * cols * channels); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
nFidx = 0; |
|
|
|
|
|
|
|
inited = false; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
~cuFilter() |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for (int i = 0; i < N; i++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
cudaStreamSynchronize(streams[i]); |
|
|
|
|
|
|
|
cudaFree(srcmem[i]); |
|
|
|
|
|
|
|
cudaFree(dstmem[i]); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void filter(Mat src, Mat& dst, int r, double sigmaC, double sigmaD) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
cudaMemcpy(srcmem[nFidx], src.data, src.rows * src.cols * src.channels(), cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
bilakernel_v2 << <grid, block, sharedMemSize, streams[nFidx] >> > (srcmem[nFidx], dstmem[nFidx], src.rows, src.cols, r, sigmaC, sigmaD); |
|
|
|
|
|
|
|
if (inited) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
nFidx += 1; |
|
|
|
|
|
|
|
if (nFidx == N) nFidx = 0; |
|
|
|
|
|
|
|
cudaStreamSynchronize(streams[nFidx]); |
|
|
|
|
|
|
|
src.copyTo(dst); |
|
|
|
|
|
|
|
cudaMemcpy(dst.data, dstmem[nFidx], src.rows * src.cols * src.channels(), cudaMemcpyDeviceToHost); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
nFidx++; |
|
|
|
|
|
|
|
if (nFidx + 1 == N) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
inited = true; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dim3 grid; |
|
|
|
|
|
|
|
dim3 block; |
|
|
|
|
|
|
|
cudaStream_t streams[N]; |
|
|
|
|
|
|
|
int nFidx; |
|
|
|
|
|
|
|
int sharedMemSize; |
|
|
|
|
|
|
|
bool inited; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uchar3* srcmem[N]; |
|
|
|
|
|
|
|
uchar3* dstmem[N]; |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int main() |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::cout << "hello world" << std::endl; |
|
|
|
|
|
|
|
cuFilter<64> filter(dim3(32, 32), dim3(16, 16), 512, 512, 3, 32 * 32 * 16); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Mat image = imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png"); |
|
|
|
|
|
|
|
Mat dst; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::chrono::time_point<std::chrono::steady_clock> start, end; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
for(int i = 0 ;i< 100; i++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
filter.filter(image, dst, 4, 15, 23); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
end = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
for(int i = 0; i< 100;i++) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
bilateralFilter(image, dst, 8, 15, 23); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
end = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |