init

2 years ago · b92d10bc09
2 changed files with 159 additions and 0 deletions
--- a/cuRFilter.sln
+++ b/cuRFilter.sln
@ -0,0 +1,25 @@
				@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.5.33627.172
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuRFilter", "cuRFilter\cuRFilter.vcxproj", "{F8C4AC1F-11D9-48AE-B3A7-D4AFAA6DEEA4}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{F8C4AC1F-11D9-48AE-B3A7-D4AFAA6DEEA4}.Debug|x64.ActiveCfg = Debug|x64
+		{F8C4AC1F-11D9-48AE-B3A7-D4AFAA6DEEA4}.Debug|x64.Build.0 = Debug|x64
+		{F8C4AC1F-11D9-48AE-B3A7-D4AFAA6DEEA4}.Release|x64.ActiveCfg = Release|x64
+		{F8C4AC1F-11D9-48AE-B3A7-D4AFAA6DEEA4}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {9176E986-5B87-4096-B603-4E14C8A0CB65}
+	EndGlobalSection
+EndGlobal
--- a/cuRFilter/kernel.cu
+++ b/cuRFilter/kernel.cu
@ -0,0 +1,134 @@
				@@ -0,0 +1,134 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <math.h>
+#include <stdio.h>
+#include  <opencv2/opencv.hpp>
+
+
+__global__ void kernelR(double3* src, uchar3* dst, double* maskhALL, double* maskvALL, int height, int width, float sigma_s, float sigma_r, int no_of_iter)
+{
+	int id = threadIdx.y * blockDim.x + threadIdx.x;
+	double* maskh = maskhALL + height * id;
+	double* maskv = maskvALL + width * id;
+
+	if (id < height)
+	{
+		for (int i = 0; i < width - 1; i++)
+			maskv[i] = 1 + sigma_s / sigma_r * (abs(src[id * width + i + 1].x - src[id * width + i].x) +
+				abs(src[id * width + i + 1].y - src[id * width + i].y) +
+				abs(src[id * width + i + 1].z - src[id * width + i].z));
+		maskv[width - 1] = 1;
+	}
+	if (id < width)
+	{
+		for (int j = 0; j < height - 1; j++)
+			maskh[j] = 1 + sigma_s / sigma_r * (abs(src[(j + 1) * width + id].x - src[j * width + id].x) +
+				abs(src[(j + 1) * width + id].y - src[j * width + id].y) +
+				abs(src[(j + 1) * width + id].z - src[j * width + id].z));
+		maskh[height - 1] = 1;
+	}
+	// __syncthreads();
+	// printf("id = %d\n", id);
+
+
+	for (int i = 0; i < no_of_iter; i++)
+	{
+		float sigma_h = sigma_s * sqrt(3.0) * pow(2.0, (no_of_iter - (i + 1))) / sqrt(pow(4.0, no_of_iter) - 1);
+		float a = (float)exp((-1.0 * sqrt(2.0)) / sigma_h);
+
+
+		if (id < height)
+			for (int j = 1; j < width; j++)
+			{
+				src[id * width + j].x += (src[id * width + j - 1].x - src[id * width + j].x) * pow(a, maskv[j]);
+				src[id * width + j].y += (src[id * width + j - 1].y - src[id * width + j].y) * pow(a, maskv[j]);
+				src[id * width + j].z += (src[id * width + j - 1].z - src[id * width + j].z) * pow(a, maskv[j]);
+			}
+
+		///test 
+
+		__syncthreads();
+		if (id < height)
+			for (int j = width - 2; j >= 0; j--)
+			{
+				src[id * width + j].x += (src[id * width + j + 1].x - src[id * width + j].x) * pow(a, maskv[j]);
+				src[id * width + j].y += (src[id * width + j + 1].y - src[id * width + j].y) * pow(a, maskv[j]);
+				src[id * width + j].z += (src[id * width + j + 1].z - src[id * width + j].z) * pow(a, maskv[j]);
+			}
+
+		__syncthreads();
+		if (id < width)
+			for (int j = 1; j < height; j++)
+			{
+				src[j * width + id].x += (src[(j - 1) * width + id].x - src[j * width + id].x) * pow(a, maskh[j]);
+				src[j * width + id].y += (src[(j - 1) * width + id].y - src[j * width + id].y) * pow(a, maskh[j]);
+				src[j * width + id].z += (src[(j - 1) * width + id].z - src[j * width + id].z) * pow(a, maskh[j]);
+			}
+		__syncthreads();
+		if (id < width)
+			for (int j = height - 2; j >= 0; j--)
+			{
+				src[j * width + id].x += (src[(j + 1) * width + id].x - src[j * width + id].x) * pow(a, maskh[j]);
+				src[j * width + id].y += (src[(j + 1) * width + id].y - src[j * width + id].y) * pow(a, maskh[j]);
+				src[j * width + id].z += (src[(j + 1) * width + id].z - src[j * width + id].z) * pow(a, maskh[j]);
+			}
+		__syncthreads();
+	}
+	if (id < height)
+		for (int j = 0; j < width; j++)
+		{
+			dst[id * width + j].x = src[id * width + j].x > 1 ? 255 : (uchar)(src[id * width + j].x * 255.0);
+			dst[id * width + j].y = src[id * width + j].y > 1 ? 255 : (uchar)(src[id * width + j].y * 255.0);
+			dst[id * width + j].z = src[id * width + j].z > 1 ? 255 : (uchar)(src[id * width + j].z * 255.0);
+
+		}
+}
+
+
+
+int main()
+{
+	cudaError_t err;
+
+	cv::Mat image = cv::imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
+	cv::Mat dst = cv::Mat(image.rows, image.cols, CV_8UC3);
+	double3* src_GPU;
+	uchar3* dst_GPU;
+	cudaMalloc(&src_GPU, image.rows * image.cols * 3 * 8);
+	cudaMalloc(&dst_GPU, image.rows * image.cols * 3);
+
+	double* maskv;
+	double* maskh;
+
+	cudaMalloc(&maskv, sizeof(double) * image.cols * image.rows);
+	cudaMalloc(&maskh, sizeof(double) * image.cols * image.rows);
+
+
+	std::chrono::time_point<std::chrono::steady_clock> start, end;
+	start = std::chrono::high_resolution_clock::now();
+	cv::Mat temp;
+	image.convertTo(temp, CV_64FC3, 1.0 / 255.0);
+	cudaMemcpy(src_GPU, temp.data, image.rows * image.cols * 3 * 8, cudaMemcpyHostToDevice);
+	kernelR << <1, dim3(32, 32) >> > (src_GPU, dst_GPU, maskv, maskh, image.rows, image.cols, 43, 0.7, 3);
+	cudaDeviceSynchronize();
+	cudaMemcpy(dst.data, dst_GPU, dst.rows * dst.cols * 3, cudaMemcpyDeviceToHost);
+	end = std::chrono::high_resolution_clock::now();
+	std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
+
+	
+	start = std::chrono::high_resolution_clock::now();
+	cv::Mat edge;
+	cv::edgePreservingFilter(image, edge, 1, 43, 0.7);
+	end = std::chrono::high_resolution_clock::now();
+	std::cout << "cpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
+
+	cv::imshow("cpu", edge);
+	cv::imshow("gpu ", dst);
+	cv::imshow("sub", dst - edge);
+	cv::imshow("sub2", edge - dst);
+
+	cv::waitKey(0);
+	return 0;
+}