cuEdgeRfilter/cuRFilter/kernel.cu


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <math.h>
#include <stdio.h>
#include  <opencv2/opencv.hpp>


__global__ void kernelR(double3* src, uchar3* dst, double* maskhALL, double* maskvALL, int height, int width, float sigma_s, float sigma_r, int no_of_iter)
{
	int id = threadIdx.y * blockDim.x + threadIdx.x;
	double* maskh = maskhALL + height * id;
	double* maskv = maskvALL + width * id;

	if (id < height)
	{
		for (int i = 0; i < width - 1; i++)
			maskv[i] = 1 + sigma_s / sigma_r * (abs(src[id * width + i + 1].x - src[id * width + i].x) +
				abs(src[id * width + i + 1].y - src[id * width + i].y) +
				abs(src[id * width + i + 1].z - src[id * width + i].z));
		maskv[width - 1] = 1;
	}
	if (id < width)
	{
		for (int j = 0; j < height - 1; j++)
			maskh[j] = 1 + sigma_s / sigma_r * (abs(src[(j + 1) * width + id].x - src[j * width + id].x) +
				abs(src[(j + 1) * width + id].y - src[j * width + id].y) +
				abs(src[(j + 1) * width + id].z - src[j * width + id].z));
		maskh[height - 1] = 1;
	}
	// __syncthreads();
	// printf("id = %d\n", id);


	for (int i = 0; i < no_of_iter; i++)
	{
		float sigma_h = sigma_s * sqrt(3.0) * pow(2.0, (no_of_iter - (i + 1))) / sqrt(pow(4.0, no_of_iter) - 1);
		float a = (float)exp((-1.0 * sqrt(2.0)) / sigma_h);


		if (id < height)
			for (int j = 1; j < width; j++)
			{
				src[id * width + j].x += (src[id * width + j - 1].x - src[id * width + j].x) * pow(a, maskv[j]);
				src[id * width + j].y += (src[id * width + j - 1].y - src[id * width + j].y) * pow(a, maskv[j]);
				src[id * width + j].z += (src[id * width + j - 1].z - src[id * width + j].z) * pow(a, maskv[j]);
			}

		///test

		__syncthreads();
		if (id < height)
			for (int j = width - 2; j >= 0; j--)
			{
				src[id * width + j].x += (src[id * width + j + 1].x - src[id * width + j].x) * pow(a, maskv[j]);
				src[id * width + j].y += (src[id * width + j + 1].y - src[id * width + j].y) * pow(a, maskv[j]);
				src[id * width + j].z += (src[id * width + j + 1].z - src[id * width + j].z) * pow(a, maskv[j]);
			}

		__syncthreads();
		if (id < width)
			for (int j = 1; j < height; j++)
			{
				src[j * width + id].x += (src[(j - 1) * width + id].x - src[j * width + id].x) * pow(a, maskh[j]);
				src[j * width + id].y += (src[(j - 1) * width + id].y - src[j * width + id].y) * pow(a, maskh[j]);
				src[j * width + id].z += (src[(j - 1) * width + id].z - src[j * width + id].z) * pow(a, maskh[j]);
			}
		__syncthreads();
		if (id < width)
			for (int j = height - 2; j >= 0; j--)
			{
				src[j * width + id].x += (src[(j + 1) * width + id].x - src[j * width + id].x) * pow(a, maskh[j]);
				src[j * width + id].y += (src[(j + 1) * width + id].y - src[j * width + id].y) * pow(a, maskh[j]);
				src[j * width + id].z += (src[(j + 1) * width + id].z - src[j * width + id].z) * pow(a, maskh[j]);
			}
		__syncthreads();
	}
	if (id < height)
		for (int j = 0; j < width; j++)
		{
			dst[id * width + j].x = src[id * width + j].x > 1 ? 255 : (uchar)(src[id * width + j].x * 255.0);
			dst[id * width + j].y = src[id * width + j].y > 1 ? 255 : (uchar)(src[id * width + j].y * 255.0);
			dst[id * width + j].z = src[id * width + j].z > 1 ? 255 : (uchar)(src[id * width + j].z * 255.0);

		}
}


int main()
{
	cudaError_t err;

	cv::Mat image = cv::imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
	cv::Mat dst = cv::Mat(image.rows, image.cols, CV_8UC3);
	double3* src_GPU;
	uchar3* dst_GPU;
	cudaMalloc(&src_GPU, image.rows * image.cols * 3 * 8);
	cudaMalloc(&dst_GPU, image.rows * image.cols * 3);

	double* maskv;
	double* maskh;

	cudaMalloc(&maskv, sizeof(double) * image.cols * image.rows);
	cudaMalloc(&maskh, sizeof(double) * image.cols * image.rows);


	std::chrono::time_point<std::chrono::steady_clock> start, end;
	start = std::chrono::high_resolution_clock::now();
	cv::Mat temp;
	image.convertTo(temp, CV_64FC3, 1.0 / 255.0);
	cudaMemcpy(src_GPU, temp.data, image.rows * image.cols * 3 * 8, cudaMemcpyHostToDevice);
	kernelR << <1, dim3(32, 32) >> > (src_GPU, dst_GPU, maskv, maskh, image.rows, image.cols, 43, 0.7, 3);
	cudaDeviceSynchronize();
	cudaMemcpy(dst.data, dst_GPU, dst.rows * dst.cols * 3, cudaMemcpyDeviceToHost);
	end = std::chrono::high_resolution_clock::now();
	std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;


	start = std::chrono::high_resolution_clock::now();
	cv::Mat edge;
	cv::edgePreservingFilter(image, edge, 1, 43, 0.7);
	end = std::chrono::high_resolution_clock::now();
	std::cout << "cpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;

	cv::imshow("cpu", edge);
	cv::imshow("gpu ", dst);
	cv::imshow("sub", dst - edge);
	cv::imshow("sub2", edge - dst);

	cv::waitKey(0);
	return 0;
}