init

2 years ago · 4659eb96ec
2 changed files with 299 additions and 0 deletions
--- a/cuBilaStream.sln
+++ b/cuBilaStream.sln
@ -0,0 +1,25 @@
 Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 17
 VisualStudioVersion = 17.5.33627.172
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuBilaStream", "cuBilaStream\cuBilaStream.vcxproj", "{75C221AE-D06F-4181-866E-F78F00547173}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|x64 = Debug|x64
 		Release|x64 = Release|x64
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{75C221AE-D06F-4181-866E-F78F00547173}.Debug|x64.ActiveCfg = Debug|x64
 		{75C221AE-D06F-4181-866E-F78F00547173}.Debug|x64.Build.0 = Debug|x64
 		{75C221AE-D06F-4181-866E-F78F00547173}.Release|x64.ActiveCfg = Release|x64
 		{75C221AE-D06F-4181-866E-F78F00547173}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {C986B33D-AD9E-4EE4-BD4C-C49A516FCF54}
 	EndGlobalSection
 EndGlobal
--- a/cuBilaStream/kernel.cu
+++ b/cuBilaStream/kernel.cu
@ -0,0 +1,274 @@
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 #include <stdio.h>
 #include <math.h>
 #include <opencv2/opencv.hpp>
 #include <chrono>
 using namespace cv;
 #define threadX 16
 #define threadY 16
 #define threadZ 1
 /// <summary>
 /// 双边滤波的核函数，边界以0填充
 /// </summary>
 /// <param name="src"></param>
 /// <param name="dst"></param>
 /// <param name="width"></param>
 /// <param name="height"></param>
 /// <param name="r"></param>
 /// <param name="sigmaC"></param>
 /// <param name="sigmaS"></param>
 /// <returns></returns>
 __global__ void bilakernel(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaS)
 {
 	int x = threadIdx.x + blockDim.x * blockIdx.x;
 	int y = threadIdx.y + blockDim.y * blockIdx.y;
 	if (x >= width || y >= height)return;
 	double mask = 0;
 	double3 result;
 	// mask.x = 0;
 	// mask.y = 0;
 	// mask.z = 0;
 	result.x = 0;
 	result.y = 0;
 	result.z = 0;
 	uchar3 origin = src[x + y * width];
 	for (int i = -r; i <= r; i++)
 	{
 		for (int j = -r; j <= r; j++)
 		{
 			if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height)
 			{
 				uchar3 tmp = src[x + i + (y + j) * width];
 				int3 sub;
 				sub.x = tmp.x - origin.x;
 				sub.y = tmp.y - origin.y;
 				sub.z = tmp.z - origin.z;
 				double masktmp = (exp(-0.5 / sigmaC / sigmaC * sub.x * sub.x) +
 					exp(-0.5 / sigmaC / sigmaC * sub.y * sub.y) +
 					exp(-0.5 / sigmaC / sigmaC * sub.z * sub.z)) *
 					exp(-0.5 / sigmaS / sigmaS * (i * i + j * j));
 				mask += masktmp;
 				result.x += tmp.x * masktmp;
 				result.y += tmp.y * masktmp;
 				result.z += tmp.z * masktmp;
 			}
 		}
 	}
 	if (mask > 0)
 	{
 		dst[x + y * width].x = result.x / mask;
 		dst[x + y * width].y = result.y / mask;
 		dst[x + y * width].z = result.z / mask;
 	}
 	else
 	{
 		dst[x + y * width].x = 0;
 		dst[x + y * width].y = 0;
 		dst[x + y * width].z = 0;
 	}
 }
 /// <summary>
 /// 双边滤波的核函数，通过共享内存加速
 /// </summary>
 /// <param name="src"></param>
 /// <param name="dst"></param>
 /// <param name="width"></param>
 /// <param name="height"></param>
 /// <param name="r"></param>
 /// <param name="sigmaC"></param>
 /// <param name="sigmaD"></param>
 /// <returns></returns>
 __global__ void bilakernel_v2(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaD)
 {
 	int x = threadIdx.x + blockDim.x * blockIdx.x;
 	int y = threadIdx.y + blockDim.y * blockIdx.y;
 	if (x >= width || y >= height)return;
 	const int blocksize = (2 * r + threadX) * (2 * r + threadY);
 	const int id = x + y * width;
 	extern  __shared__ uchar3 box[];// (2 * threadX)* (2 * threadY)];
 	int n = 0;
 	while ((threadIdx.x + threadIdx.y * blockDim.x + n * blockDim.x * blockDim.y) < (2 * r + blockDim.x) * (2 * r + blockDim.y))
 	{
 		int localIndex = threadIdx.x + threadIdx.y * blockDim.x + n * blockDim.x * blockDim.y;
 		int x_temp = localIndex % (blockDim.x + 2 * r);
 		int y_temp = localIndex / (blockDim.x + 2 * r);
 		int x_real = x_temp - r + blockDim.x * blockIdx.x;
 		int y_real = y_temp - r + blockDim.y * blockIdx.y;
 		if (x_real < 0 || x_real >= width || y_real < 0 || y_real >= height)
 		{
 			box[localIndex].x = 0;
 			box[localIndex].y = 0;
 			box[localIndex].z = 0;
 		}
 		else
 		{
 			box[localIndex] = src[x_real + y_real * width];
 		}
 		n++;
 	}
 	__syncthreads();
 	double coeff = 0;
 	double sum[3] = { 0 };
 	int idCenter = r + threadIdx.x + (r + threadIdx.y) * (r * 2 + blockDim.y);
 	for (int i = -r; i <= r; i++)
 	{
 		for (int j = -r; j <= r; j++)
 		{
 			if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height)
 			{
 				int idCur = r + i + threadIdx.x + (r + j + threadIdx.y) * (r * 2 + blockDim.y);
 				double tmp = (
 					exp(-0.5 / sigmaC / sigmaC * (box[idCur].x - box[idCenter].x) * (box[idCur].x - box[idCenter].x)) +
 					exp(-0.5 / sigmaC / sigmaC * (box[idCur].y - box[idCenter].y) * (box[idCur].y - box[idCenter].y)) +
 					exp(-0.5 / sigmaC / sigmaC * (box[idCur].z - box[idCenter].z) * (box[idCur].z - box[idCenter].z))
 					) *
 					exp(-0.5 / sigmaD / sigmaD *
 						(i * i + j * j));
 				coeff += tmp;
 				sum[0] += tmp * box[idCur].x;
 				sum[1] += tmp * box[idCur].y;
 				sum[2] += tmp * box[idCur].z;
 			}
 		}
 	}
 	if (coeff > 0) {
 		dst[id].x = (uchar)(sum[0] / coeff);
 		dst[id].y = (uchar)(sum[1] / coeff);
 		dst[id].z = (uchar)(sum[2] / coeff);
 	}
 	else
 	{
 		dst[id].x = 0;
 		dst[id].y = 0;
 		dst[id].z = 0;
 	}
 }
 /**
 * \brief 模板类
 * \tparam N 共有N个流来加速
 */
 template <int N>
 class cuFilter
 {
 public:
 	cuFilter(dim3 g, dim3 b, int rows, int cols, int channels, int smSize) :grid(g), block(b), sharedMemSize(smSize)
 	{
 		for (int i = 0; i < N; i++) {
 			cudaStreamCreate(&streams[i]);
 			cudaMalloc(&srcmem[i], rows * cols * channels);
 			cudaMalloc(&dstmem[i], rows * cols * channels);
 		}
 		nFidx = 0;
 		inited = false;
 	}
 	~cuFilter()
 	{
 		for (int i = 0; i < N; i++)
 		{
 			cudaStreamSynchronize(streams[i]);
 			cudaFree(srcmem[i]);
 			cudaFree(dstmem[i]);
 		}
 	}
 	void filter(Mat src, Mat& dst, int r, double sigmaC, double sigmaD)
 	{
 		cudaMemcpy(srcmem[nFidx], src.data, src.rows * src.cols * src.channels(), cudaMemcpyHostToDevice);
 		bilakernel_v2 << <grid, block, sharedMemSize, streams[nFidx] >> > (srcmem[nFidx], dstmem[nFidx], src.rows, src.cols, r, sigmaC, sigmaD);
 		if (inited)
 		{
 			nFidx += 1;
 			if (nFidx == N) nFidx = 0;
 			cudaStreamSynchronize(streams[nFidx]);
 			src.copyTo(dst);
 			cudaMemcpy(dst.data, dstmem[nFidx], src.rows * src.cols * src.channels(), cudaMemcpyDeviceToHost);
 		}
 		else
 		{
 			nFidx++;
 			if (nFidx + 1 == N)
 			{
 				inited = true;
 			}
 		}
 	}
 	dim3 grid;
 	dim3 block;
 	cudaStream_t streams[N];
 	int nFidx;
 	int sharedMemSize;
 	bool inited;
 	uchar3* srcmem[N];
 	uchar3* dstmem[N];
 };
 int main()
 {
 	std::cout << "hello world" << std::endl;
 	cuFilter<64> filter(dim3(32, 32), dim3(16, 16), 512, 512, 3, 32 * 32 * 16);
 	Mat image = imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
 	Mat dst;
 	std::chrono::time_point<std::chrono::steady_clock> start, end;
 	start = std::chrono::high_resolution_clock::now();
 	for(int i = 0 ;i< 100; i++)
 	{
 		filter.filter(image, dst, 4, 15, 23);		
 	}
 	end = std::chrono::high_resolution_clock::now();
 	std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
 	start = std::chrono::high_resolution_clock::now();
 	for(int i = 0; i< 100;i++)
 	{
 		bilateralFilter(image, dst, 8, 15, 23);
 	}
 	end = std::chrono::high_resolution_clock::now();
 	std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
 }