使用cuda加速opencv的edgepreservingfilter滤波。其中flag为 Recursive Filtering 。
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

134 lines
4.7 KiB

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
#include <opencv2/opencv.hpp>
__global__ void kernelR(double3* src, uchar3* dst, double* maskhALL, double* maskvALL, int height, int width, float sigma_s, float sigma_r, int no_of_iter)
{
int id = threadIdx.y * blockDim.x + threadIdx.x;
double* maskh = maskhALL + height * id;
double* maskv = maskvALL + width * id;
if (id < height)
{
for (int i = 0; i < width - 1; i++)
maskv[i] = 1 + sigma_s / sigma_r * (abs(src[id * width + i + 1].x - src[id * width + i].x) +
abs(src[id * width + i + 1].y - src[id * width + i].y) +
abs(src[id * width + i + 1].z - src[id * width + i].z));
maskv[width - 1] = 1;
}
if (id < width)
{
for (int j = 0; j < height - 1; j++)
maskh[j] = 1 + sigma_s / sigma_r * (abs(src[(j + 1) * width + id].x - src[j * width + id].x) +
abs(src[(j + 1) * width + id].y - src[j * width + id].y) +
abs(src[(j + 1) * width + id].z - src[j * width + id].z));
maskh[height - 1] = 1;
}
// __syncthreads();
// printf("id = %d\n", id);
for (int i = 0; i < no_of_iter; i++)
{
float sigma_h = sigma_s * sqrt(3.0) * pow(2.0, (no_of_iter - (i + 1))) / sqrt(pow(4.0, no_of_iter) - 1);
float a = (float)exp((-1.0 * sqrt(2.0)) / sigma_h);
if (id < height)
for (int j = 1; j < width; j++)
{
src[id * width + j].x += (src[id * width + j - 1].x - src[id * width + j].x) * pow(a, maskv[j]);
src[id * width + j].y += (src[id * width + j - 1].y - src[id * width + j].y) * pow(a, maskv[j]);
src[id * width + j].z += (src[id * width + j - 1].z - src[id * width + j].z) * pow(a, maskv[j]);
}
///test
__syncthreads();
if (id < height)
for (int j = width - 2; j >= 0; j--)
{
src[id * width + j].x += (src[id * width + j + 1].x - src[id * width + j].x) * pow(a, maskv[j]);
src[id * width + j].y += (src[id * width + j + 1].y - src[id * width + j].y) * pow(a, maskv[j]);
src[id * width + j].z += (src[id * width + j + 1].z - src[id * width + j].z) * pow(a, maskv[j]);
}
__syncthreads();
if (id < width)
for (int j = 1; j < height; j++)
{
src[j * width + id].x += (src[(j - 1) * width + id].x - src[j * width + id].x) * pow(a, maskh[j]);
src[j * width + id].y += (src[(j - 1) * width + id].y - src[j * width + id].y) * pow(a, maskh[j]);
src[j * width + id].z += (src[(j - 1) * width + id].z - src[j * width + id].z) * pow(a, maskh[j]);
}
__syncthreads();
if (id < width)
for (int j = height - 2; j >= 0; j--)
{
src[j * width + id].x += (src[(j + 1) * width + id].x - src[j * width + id].x) * pow(a, maskh[j]);
src[j * width + id].y += (src[(j + 1) * width + id].y - src[j * width + id].y) * pow(a, maskh[j]);
src[j * width + id].z += (src[(j + 1) * width + id].z - src[j * width + id].z) * pow(a, maskh[j]);
}
__syncthreads();
}
if (id < height)
for (int j = 0; j < width; j++)
{
dst[id * width + j].x = src[id * width + j].x > 1 ? 255 : (uchar)(src[id * width + j].x * 255.0);
dst[id * width + j].y = src[id * width + j].y > 1 ? 255 : (uchar)(src[id * width + j].y * 255.0);
dst[id * width + j].z = src[id * width + j].z > 1 ? 255 : (uchar)(src[id * width + j].z * 255.0);
}
}
int main()
{
cudaError_t err;
cv::Mat image = cv::imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
cv::Mat dst = cv::Mat(image.rows, image.cols, CV_8UC3);
double3* src_GPU;
uchar3* dst_GPU;
cudaMalloc(&src_GPU, image.rows * image.cols * 3 * 8);
cudaMalloc(&dst_GPU, image.rows * image.cols * 3);
double* maskv;
double* maskh;
cudaMalloc(&maskv, sizeof(double) * image.cols * image.rows);
cudaMalloc(&maskh, sizeof(double) * image.cols * image.rows);
std::chrono::time_point<std::chrono::steady_clock> start, end;
start = std::chrono::high_resolution_clock::now();
cv::Mat temp;
image.convertTo(temp, CV_64FC3, 1.0 / 255.0);
cudaMemcpy(src_GPU, temp.data, image.rows * image.cols * 3 * 8, cudaMemcpyHostToDevice);
kernelR << <1, dim3(32, 32) >> > (src_GPU, dst_GPU, maskv, maskh, image.rows, image.cols, 43, 0.7, 3);
cudaDeviceSynchronize();
cudaMemcpy(dst.data, dst_GPU, dst.rows * dst.cols * 3, cudaMemcpyDeviceToHost);
end = std::chrono::high_resolution_clock::now();
std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
start = std::chrono::high_resolution_clock::now();
cv::Mat edge;
cv::edgePreservingFilter(image, edge, 1, 43, 0.7);
end = std::chrono::high_resolution_clock::now();
std::cout << "cpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
cv::imshow("cpu", edge);
cv::imshow("gpu ", dst);
cv::imshow("sub", dst - edge);
cv::imshow("sub2", edge - dst);
cv::waitKey(0);
return 0;
}