Browse Source

init

master
zara 2 years ago
commit
4659eb96ec
  1. 25
      cuBilaStream.sln
  2. 274
      cuBilaStream/kernel.cu

25
cuBilaStream.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.33627.172
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuBilaStream", "cuBilaStream\cuBilaStream.vcxproj", "{75C221AE-D06F-4181-866E-F78F00547173}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{75C221AE-D06F-4181-866E-F78F00547173}.Debug|x64.ActiveCfg = Debug|x64
{75C221AE-D06F-4181-866E-F78F00547173}.Debug|x64.Build.0 = Debug|x64
{75C221AE-D06F-4181-866E-F78F00547173}.Release|x64.ActiveCfg = Release|x64
{75C221AE-D06F-4181-866E-F78F00547173}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {C986B33D-AD9E-4EE4-BD4C-C49A516FCF54}
EndGlobalSection
EndGlobal

274
cuBilaStream/kernel.cu

@ -0,0 +1,274 @@ @@ -0,0 +1,274 @@

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <opencv2/opencv.hpp>
#include <chrono>
using namespace cv;
#define threadX 16
#define threadY 16
#define threadZ 1
/// <summary>
/// 双边滤波的核函数,边界以0填充
/// </summary>
/// <param name="src"></param>
/// <param name="dst"></param>
/// <param name="width"></param>
/// <param name="height"></param>
/// <param name="r"></param>
/// <param name="sigmaC"></param>
/// <param name="sigmaS"></param>
/// <returns></returns>
__global__ void bilakernel(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaS)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x >= width || y >= height)return;
double mask = 0;
double3 result;
// mask.x = 0;
// mask.y = 0;
// mask.z = 0;
result.x = 0;
result.y = 0;
result.z = 0;
uchar3 origin = src[x + y * width];
for (int i = -r; i <= r; i++)
{
for (int j = -r; j <= r; j++)
{
if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height)
{
uchar3 tmp = src[x + i + (y + j) * width];
int3 sub;
sub.x = tmp.x - origin.x;
sub.y = tmp.y - origin.y;
sub.z = tmp.z - origin.z;
double masktmp = (exp(-0.5 / sigmaC / sigmaC * sub.x * sub.x) +
exp(-0.5 / sigmaC / sigmaC * sub.y * sub.y) +
exp(-0.5 / sigmaC / sigmaC * sub.z * sub.z)) *
exp(-0.5 / sigmaS / sigmaS * (i * i + j * j));
mask += masktmp;
result.x += tmp.x * masktmp;
result.y += tmp.y * masktmp;
result.z += tmp.z * masktmp;
}
}
}
if (mask > 0)
{
dst[x + y * width].x = result.x / mask;
dst[x + y * width].y = result.y / mask;
dst[x + y * width].z = result.z / mask;
}
else
{
dst[x + y * width].x = 0;
dst[x + y * width].y = 0;
dst[x + y * width].z = 0;
}
}
/// <summary>
/// 双边滤波的核函数,通过共享内存加速
/// </summary>
/// <param name="src"></param>
/// <param name="dst"></param>
/// <param name="width"></param>
/// <param name="height"></param>
/// <param name="r"></param>
/// <param name="sigmaC"></param>
/// <param name="sigmaD"></param>
/// <returns></returns>
__global__ void bilakernel_v2(uchar3* src, uchar3* dst, int width, int height, int r, double sigmaC, double sigmaD)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x >= width || y >= height)return;
const int blocksize = (2 * r + threadX) * (2 * r + threadY);
const int id = x + y * width;
extern __shared__ uchar3 box[];// (2 * threadX)* (2 * threadY)];
int n = 0;
while ((threadIdx.x + threadIdx.y * blockDim.x + n * blockDim.x * blockDim.y) < (2 * r + blockDim.x) * (2 * r + blockDim.y))
{
int localIndex = threadIdx.x + threadIdx.y * blockDim.x + n * blockDim.x * blockDim.y;
int x_temp = localIndex % (blockDim.x + 2 * r);
int y_temp = localIndex / (blockDim.x + 2 * r);
int x_real = x_temp - r + blockDim.x * blockIdx.x;
int y_real = y_temp - r + blockDim.y * blockIdx.y;
if (x_real < 0 || x_real >= width || y_real < 0 || y_real >= height)
{
box[localIndex].x = 0;
box[localIndex].y = 0;
box[localIndex].z = 0;
}
else
{
box[localIndex] = src[x_real + y_real * width];
}
n++;
}
__syncthreads();
double coeff = 0;
double sum[3] = { 0 };
int idCenter = r + threadIdx.x + (r + threadIdx.y) * (r * 2 + blockDim.y);
for (int i = -r; i <= r; i++)
{
for (int j = -r; j <= r; j++)
{
if (x + i >= 0 && x + i < width && y + j >= 0 && y + j < height)
{
int idCur = r + i + threadIdx.x + (r + j + threadIdx.y) * (r * 2 + blockDim.y);
double tmp = (
exp(-0.5 / sigmaC / sigmaC * (box[idCur].x - box[idCenter].x) * (box[idCur].x - box[idCenter].x)) +
exp(-0.5 / sigmaC / sigmaC * (box[idCur].y - box[idCenter].y) * (box[idCur].y - box[idCenter].y)) +
exp(-0.5 / sigmaC / sigmaC * (box[idCur].z - box[idCenter].z) * (box[idCur].z - box[idCenter].z))
) *
exp(-0.5 / sigmaD / sigmaD *
(i * i + j * j));
coeff += tmp;
sum[0] += tmp * box[idCur].x;
sum[1] += tmp * box[idCur].y;
sum[2] += tmp * box[idCur].z;
}
}
}
if (coeff > 0) {
dst[id].x = (uchar)(sum[0] / coeff);
dst[id].y = (uchar)(sum[1] / coeff);
dst[id].z = (uchar)(sum[2] / coeff);
}
else
{
dst[id].x = 0;
dst[id].y = 0;
dst[id].z = 0;
}
}
/**
* \brief 模板类
* \tparam N 共有N个流来加速
*/
template <int N>
class cuFilter
{
public:
cuFilter(dim3 g, dim3 b, int rows, int cols, int channels, int smSize) :grid(g), block(b), sharedMemSize(smSize)
{
for (int i = 0; i < N; i++) {
cudaStreamCreate(&streams[i]);
cudaMalloc(&srcmem[i], rows * cols * channels);
cudaMalloc(&dstmem[i], rows * cols * channels);
}
nFidx = 0;
inited = false;
}
~cuFilter()
{
for (int i = 0; i < N; i++)
{
cudaStreamSynchronize(streams[i]);
cudaFree(srcmem[i]);
cudaFree(dstmem[i]);
}
}
void filter(Mat src, Mat& dst, int r, double sigmaC, double sigmaD)
{
cudaMemcpy(srcmem[nFidx], src.data, src.rows * src.cols * src.channels(), cudaMemcpyHostToDevice);
bilakernel_v2 << <grid, block, sharedMemSize, streams[nFidx] >> > (srcmem[nFidx], dstmem[nFidx], src.rows, src.cols, r, sigmaC, sigmaD);
if (inited)
{
nFidx += 1;
if (nFidx == N) nFidx = 0;
cudaStreamSynchronize(streams[nFidx]);
src.copyTo(dst);
cudaMemcpy(dst.data, dstmem[nFidx], src.rows * src.cols * src.channels(), cudaMemcpyDeviceToHost);
}
else
{
nFidx++;
if (nFidx + 1 == N)
{
inited = true;
}
}
}
dim3 grid;
dim3 block;
cudaStream_t streams[N];
int nFidx;
int sharedMemSize;
bool inited;
uchar3* srcmem[N];
uchar3* dstmem[N];
};
int main()
{
std::cout << "hello world" << std::endl;
cuFilter<64> filter(dim3(32, 32), dim3(16, 16), 512, 512, 3, 32 * 32 * 16);
Mat image = imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
Mat dst;
std::chrono::time_point<std::chrono::steady_clock> start, end;
start = std::chrono::high_resolution_clock::now();
for(int i = 0 ;i< 100; i++)
{
filter.filter(image, dst, 4, 15, 23);
}
end = std::chrono::high_resolution_clock::now();
std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
start = std::chrono::high_resolution_clock::now();
for(int i = 0; i< 100;i++)
{
bilateralFilter(image, dst, 8, 15, 23);
}
end = std::chrono::high_resolution_clock::now();
std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;
}
Loading…
Cancel
Save