init

2 years ago · 4659eb96ec
2 changed files with 299 additions and 0 deletions
--- a/cuBilaStream.sln
+++ b/cuBilaStream.sln
@ -0,0 +1,25 @@
				@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.5.33627.172
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuBilaStream", "cuBilaStream\cuBilaStream.vcxproj", "{75C221AE-D06F-4181-866E-F78F00547173}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{75C221AE-D06F-4181-866E-F78F00547173}.Debug|x64.ActiveCfg = Debug|x64
+		{75C221AE-D06F-4181-866E-F78F00547173}.Debug|x64.Build.0 = Debug|x64
+		{75C221AE-D06F-4181-866E-F78F00547173}.Release|x64.ActiveCfg = Release|x64
+		{75C221AE-D06F-4181-866E-F78F00547173}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {C986B33D-AD9E-4EE4-BD4C-C49A516FCF54}
+	EndGlobalSection
+EndGlobal
--- a/cuBilaStream/kernel.cu
+++ b/cuBilaStream/kernel.cu
@ -0,0 +1,274 @@
				@@ -0,0 +1,274 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <math.h>
+#include <opencv2/opencv.hpp>
+#include <chrono>
+using namespace cv;
+
+#define threadX 16
+#define threadY 16
+#define threadZ 1
+
+
+/// <summary>
+/// Bilateral filter kernel that reads every pixel directly from global memory.
+/// One thread produces one output pixel. Neighbours that lie outside the image
+/// are left out of the weighted average.
+/// </summary>
+/// <param name="src">Input image, row-major, width * height pixels of uchar3</param>
+/// <param name="dst">Output image with the same layout as src</param>
+/// <param name="width">Image width in pixels</param>
+/// <param name="height">Image height in pixels</param>
+/// <param name="r">Window radius; the window spans (2r + 1) x (2r + 1) pixels</param>
+/// <param name="sigmaC">Standard deviation of the colour (range) Gaussian</param>
+/// <param name="sigmaS">Standard deviation of the spatial Gaussian</param>
+/// <returns>Nothing; the filtered pixel is written to dst</returns>
+
+__global__ void bilakernel(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaS)
+{
+	const int col = threadIdx.x + blockDim.x * blockIdx.x;
+	const int row = threadIdx.y + blockDim.y * blockIdx.y;
+
+	// Threads of partially covered blocks that fall outside the image do nothing.
+	if (col >= width || row >= height)
+	{
+		return;
+	}
+
+	const int pixel = col + row * width;
+	const uchar3 centre = src[pixel];
+
+	// Exponent factors are the same for every neighbour, so compute them once.
+	const double rangeFactor = -0.5 / sigmaC / sigmaC;
+	const double spaceFactor = -0.5 / sigmaS / sigmaS;
+
+	// Clamp the window to the image so the loops visit in-image neighbours only.
+	const int dxFirst = (col - r < 0) ? -col : -r;
+	const int dxLast = (col + r >= width) ? width - 1 - col : r;
+	const int dyFirst = (row - r < 0) ? -row : -r;
+	const int dyLast = (row + r >= height) ? height - 1 - row : r;
+
+	double weightSum = 0;
+	double accX = 0;
+	double accY = 0;
+	double accZ = 0;
+
+	for (int dx = dxFirst; dx <= dxLast; dx++)
+	{
+		for (int dy = dyFirst; dy <= dyLast; dy++)
+		{
+			const uchar3 sample = src[pixel + dx + dy * width];
+			const int diffX = sample.x - centre.x;
+			const int diffY = sample.y - centre.y;
+			const int diffZ = sample.z - centre.z;
+
+			// Range term is the sum of the three per-channel Gaussians,
+			// scaled by the spatial Gaussian of the offset.
+			const double weight = (exp(rangeFactor * diffX * diffX) +
+				exp(rangeFactor * diffY * diffY) +
+				exp(rangeFactor * diffZ * diffZ)) *
+				exp(spaceFactor * (dx * dx + dy * dy));
+
+			weightSum += weight;
+			accX += sample.x * weight;
+			accY += sample.y * weight;
+			accZ += sample.z * weight;
+		}
+	}
+
+	// Normalise; when no positive weight was accumulated the pixel becomes black.
+	uchar3 out;
+	if (weightSum > 0)
+	{
+		out.x = (unsigned char)(accX / weightSum);
+		out.y = (unsigned char)(accY / weightSum);
+		out.z = (unsigned char)(accZ / weightSum);
+	}
+	else
+	{
+		out.x = 0;
+		out.y = 0;
+		out.z = 0;
+	}
+	dst[pixel] = out;
+}
+
+/// <summary>
+/// Bilateral filter kernel accelerated with shared memory.
+/// Each block first stages a (blockDim.x + 2r) x (blockDim.y + 2r) tile of the
+/// input (its own pixels plus a halo of r pixels) in shared memory, then every
+/// thread filters one pixel from that tile. Tile cells outside the image are
+/// zero-filled, but out-of-image neighbours are still excluded from the average.
+/// Launch requirements: a 2D block, a grid that covers the whole image, and
+/// (blockDim.x + 2r) * (blockDim.y + 2r) * sizeof(uchar3) bytes of dynamic
+/// shared memory.
+/// </summary>
+/// <param name="src">Input image, row-major, width * height pixels of uchar3</param>
+/// <param name="dst">Output image with the same layout as src</param>
+/// <param name="width">Image width in pixels</param>
+/// <param name="height">Image height in pixels</param>
+/// <param name="r">Window radius; the window spans (2r + 1) x (2r + 1) pixels</param>
+/// <param name="sigmaC">Standard deviation of the colour (range) Gaussian</param>
+/// <param name="sigmaD">Standard deviation of the spatial (distance) Gaussian</param>
+/// <returns>Nothing; the filtered pixel is written to dst</returns>
+
+__global__ void bilakernel_v2(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaD)
+{
+	extern __shared__ uchar3 box[];
+
+	// Tile geometry. The row stride is derived from blockDim.x for both the
+	// load and the read phase, so non-square blocks index the tile correctly.
+	const int tileW = (int)blockDim.x + 2 * r;
+	const int tileH = (int)blockDim.y + 2 * r;
+	const int tileSize = (tileW > 0 && tileH > 0) ? tileW * tileH : 0;
+	const int threadsPerBlock = (int)(blockDim.x * blockDim.y);
+	const int tileX0 = (int)(blockDim.x * blockIdx.x) - r;
+	const int tileY0 = (int)(blockDim.y * blockIdx.y) - r;
+
+	// Cooperative load: ALL threads of the block take part, including those
+	// whose own pixel is outside the image. Otherwise edge blocks would leave
+	// part of the tile unwritten and would reach the barrier divergently.
+	for (int localIndex = (int)(threadIdx.x + threadIdx.y * blockDim.x);
+		localIndex < tileSize;
+		localIndex += threadsPerBlock)
+	{
+		const int x_real = tileX0 + localIndex % tileW;
+		const int y_real = tileY0 + localIndex / tileW;
+
+		uchar3 pixel;
+		pixel.x = 0;
+		pixel.y = 0;
+		pixel.z = 0;
+		if (x_real >= 0 && x_real < width && y_real >= 0 && y_real < height)
+		{
+			pixel = src[x_real + y_real * width];
+		}
+		box[localIndex] = pixel;
+	}
+	__syncthreads();
+
+	// Only now may threads without an output pixel leave.
+	const int x = threadIdx.x + blockDim.x * blockIdx.x;
+	const int y = threadIdx.y + blockDim.y * blockIdx.y;
+	if (x >= width || y >= height)return;
+
+	const int id = x + y * width;
+
+	// Exponent factors are identical for every neighbour; compute them once.
+	const double rangeFactor = -0.5 / sigmaC / sigmaC;
+	const double spaceFactor = -0.5 / sigmaD / sigmaD;
+
+	double coeff = 0;
+	double sum[3] = { 0 };
+	const int idCenter = r + (int)threadIdx.x + (r + (int)threadIdx.y) * tileW;
+	for (int i = -r; i <= r; i++)
+	{
+		for (int j = -r; j <= r; j++)
+		{
+			if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height)
+			{
+				const uchar3 center = box[idCenter];
+				const uchar3 cur = box[idCenter + i + j * tileW];
+				const int dx = cur.x - center.x;
+				const int dy = cur.y - center.y;
+				const int dz = cur.z - center.z;
+
+				// Range term is the sum of the three per-channel Gaussians,
+				// scaled by the spatial Gaussian of the offset.
+				const double tmp = (
+					exp(rangeFactor * dx * dx) +
+					exp(rangeFactor * dy * dy) +
+					exp(rangeFactor * dz * dz)
+					) *
+					exp(spaceFactor * (i * i + j * j));
+				coeff += tmp;
+				sum[0] += tmp * cur.x;
+				sum[1] += tmp * cur.y;
+				sum[2] += tmp * cur.z;
+			}
+		}
+	}
+
+	// Normalise; when no positive weight was accumulated the pixel becomes black.
+	if (coeff > 0) {
+		dst[id].x = (unsigned char)(sum[0] / coeff);
+		dst[id].y = (unsigned char)(sum[1] / coeff);
+		dst[id].z = (unsigned char)(sum[2] / coeff);
+	}
+	else
+	{
+		dst[id].x = 0;
+		dst[id].y = 0;
+		dst[id].z = 0;
+	}
+}
+
+
+/**
+ * \brief Bilateral filter that pipelines frames over N CUDA streams.
+ *
+ * Every stream owns one source and one destination device buffer. filter()
+ * submits the new frame to the current slot and, once all N slots have been
+ * used, returns the result of the oldest slot. The frame written to dst is
+ * therefore the one submitted N - 1 calls earlier, and the first N - 1 calls
+ * leave dst untouched.
+ *
+ * The grid and block passed to the constructor are used unchanged for every
+ * launch; the caller must make sure the grid covers the whole frame.
+ *
+ * \tparam N number of streams (and buffer slots) used for the pipeline, N >= 1
+ */
+template <int N>
+class cuFilter
+{
+	static_assert(N >= 1, "cuFilter needs at least one stream");
+
+public:
+	/**
+	 * \brief Creates the streams and allocates the per-stream device buffers.
+	 * \param g grid dimensions used for every kernel launch
+	 * \param b block dimensions used for every kernel launch
+	 * \param rows frame height in pixels
+	 * \param cols frame width in pixels
+	 * \param channels bytes per pixel used to size the buffers
+	 * \param smSize dynamic shared memory per block in bytes (raised automatically
+	 *        when the kernel needs more for the requested radius)
+	 */
+	cuFilter(dim3 g, dim3 b, int rows, int cols, int channels, int smSize)
+		: grid(g), block(b), nFidx(0), sharedMemSize(smSize), inited(false),
+		frameRows(rows), frameCols(cols), bufferBytes(0), valid(false)
+	{
+		// Null everything first so the destructor is safe after a partial failure.
+		for (int i = 0; i < N; i++)
+		{
+			streams[i] = nullptr;
+			srcmem[i] = nullptr;
+			dstmem[i] = nullptr;
+		}
+
+		if (rows <= 0 || cols <= 0 || channels <= 0)
+		{
+			fprintf(stderr, "cuFilter: invalid frame geometry %d x %d x %d\n", rows, cols, channels);
+			return;
+		}
+		bufferBytes = (size_t)rows * (size_t)cols * (size_t)channels;
+
+		valid = true;
+		for (int i = 0; i < N && valid; i++)
+		{
+			valid = check(cudaStreamCreate(&streams[i]), "cudaStreamCreate")
+				&& check(cudaMalloc(&srcmem[i], bufferBytes), "cudaMalloc(src)")
+				&& check(cudaMalloc(&dstmem[i], bufferBytes), "cudaMalloc(dst)");
+		}
+	}
+
+	~cuFilter()
+	{
+		for (int i = 0; i < N; i++)
+		{
+			if (streams[i] != nullptr)
+			{
+				// Drain pending work before the buffers it uses are released.
+				cudaStreamSynchronize(streams[i]);
+				cudaStreamDestroy(streams[i]);
+			}
+			cudaFree(srcmem[i]);
+			cudaFree(dstmem[i]);
+		}
+	}
+
+	// The object owns streams and device memory; copying would double-free them.
+	cuFilter(const cuFilter&) = delete;
+	cuFilter& operator=(const cuFilter&) = delete;
+
+	/**
+	 * \brief Submits one frame and, once the pipeline is full, fetches the oldest result.
+	 * \param src continuous CV_8UC3 frame whose size matches the constructor's rows/cols
+	 * \param dst receives the filtered frame submitted N - 1 calls earlier; left
+	 *        untouched while the pipeline is filling or when an error is reported
+	 * \param r window radius in pixels, must not be negative
+	 * \param sigmaC standard deviation of the colour Gaussian
+	 * \param sigmaD standard deviation of the spatial Gaussian
+	 */
+	void filter(Mat src, Mat& dst, int r, double sigmaC, double sigmaD)
+	{
+		if (!valid)
+		{
+			fprintf(stderr, "cuFilter: device resources are not available, frame dropped\n");
+			return;
+		}
+		if (src.empty() || src.type() != CV_8UC3 || !src.isContinuous()
+			|| src.rows != frameRows || src.cols != frameCols || r < 0)
+		{
+			fprintf(stderr, "cuFilter: frame must be a continuous %d x %d CV_8UC3 image and r >= 0\n", frameRows, frameCols);
+			return;
+		}
+		const size_t bytes = src.total() * src.elemSize();
+		if (bytes > bufferBytes)
+		{
+			fprintf(stderr, "cuFilter: frame does not fit the buffers allocated at construction\n");
+			return;
+		}
+
+		// The kernel stages a block-sized tile plus an r-pixel halo in shared memory.
+		const size_t tileBytes = (size_t)(block.x + 2 * r) * (size_t)(block.y + 2 * r) * sizeof(uchar3);
+		const size_t smem = tileBytes > (size_t)sharedMemSize ? tileBytes : (size_t)sharedMemSize;
+
+		// Upload and launch on the slot's own stream so slots can overlap.
+		const int slot = nFidx;
+		if (!check(cudaMemcpyAsync(srcmem[slot], src.data, bytes, cudaMemcpyHostToDevice, streams[slot]), "host-to-device copy"))
+		{
+			return;
+		}
+		// The kernel expects (width, height), i.e. (cols, rows).
+		bilakernel_v2<<<grid, block, smem, streams[slot]>>>(srcmem[slot], dstmem[slot], src.cols, src.rows, r, sigmaC, sigmaD);
+		if (!check(cudaGetLastError(), "bilakernel_v2 launch"))
+		{
+			return;
+		}
+
+		// Advance to the next slot, which holds the oldest submitted frame.
+		nFidx = (slot + 1) % N;
+		if (nFidx == 0)
+		{
+			inited = true; // every slot has been used at least once
+		}
+		if (!inited)
+		{
+			return;
+		}
+
+		dst.create(src.rows, src.cols, src.type());
+		if (!dst.isContinuous())
+		{
+			// create() keeps an existing matching view; force a compact buffer.
+			dst.release();
+			dst.create(src.rows, src.cols, src.type());
+		}
+		if (check(cudaMemcpyAsync(dst.data, dstmem[nFidx], bytes, cudaMemcpyDeviceToHost, streams[nFidx]), "device-to-host copy"))
+		{
+			check(cudaStreamSynchronize(streams[nFidx]), "cudaStreamSynchronize");
+		}
+	}
+
+	dim3 grid;
+	dim3 block;
+	cudaStream_t streams[N];
+	int nFidx;          // slot that receives the next submitted frame
+	int sharedMemSize;  // requested dynamic shared memory per block, in bytes
+	bool inited;        // true once all N slots hold a submitted frame
+
+	uchar3* srcmem[N];
+	uchar3* dstmem[N];
+
+private:
+	// Reports a failed CUDA call on stderr; returns true when the call succeeded.
+	static bool check(cudaError_t status, const char* what)
+	{
+		if (status == cudaSuccess)
+		{
+			return true;
+		}
+		fprintf(stderr, "cuFilter: %s failed: %s\n", what, cudaGetErrorString(status));
+		return false;
+	}
+
+	int frameRows;       // frame height the buffers were sized for
+	int frameCols;       // frame width the buffers were sized for
+	size_t bufferBytes;  // size of every device buffer in bytes
+	bool valid;          // false when construction could not acquire all resources
+};
+
+
+// Benchmark: runs the streamed CUDA bilateral filter and OpenCV's CPU
+// bilateralFilter on the same image and prints the elapsed time of each.
+int main()
+{
+
+	std::cout << "hello world" << std::endl;
+
+	Mat image = imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
+	if (image.empty())
+	{
+		// imread returns an empty Mat when the file is missing or unreadable.
+		std::cerr << "failed to load the input image" << std::endl;
+		return 1;
+	}
+	Mat dst;
+
+	const int iterations = 100;
+	const int radius = 4;
+
+	// Derive the launch configuration from the image instead of assuming 512 x 512.
+	const dim3 block(threadX, threadY);
+	const dim3 grid((image.cols + threadX - 1) / threadX, (image.rows + threadY - 1) / threadY);
+	// Shared tile: one block of pixels plus a halo of `radius` pixels on every side.
+	const int sharedBytes = (threadX + 2 * radius) * (threadY + 2 * radius) * (int)sizeof(uchar3);
+	cuFilter<64> filter(grid, block, image.rows, image.cols, image.channels(), sharedBytes);
+
+	std::chrono::steady_clock::time_point start, end;
+
+	start = std::chrono::steady_clock::now();
+	for (int i = 0; i < iterations; i++)
+	{
+		filter.filter(image, dst, radius, 15, 23);
+	}
+	// Wait for work that is still queued so the measurement covers all frames.
+	cudaDeviceSynchronize();
+
+	end = std::chrono::steady_clock::now();
+	std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
+
+	start = std::chrono::steady_clock::now();
+	for (int i = 0; i < iterations; i++)
+	{
+		bilateralFilter(image, dst, 8, 15, 23);
+	}
+
+	end = std::chrono::steady_clock::now();
+	std::cout << "cpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
+
+	return 0;
+}