Browse Source

部分文件尚未添加,重新提交

master
zara 2 years ago
parent
commit
ccd93d0cfe
  1. 65
      mycu/bila.cu
  2. 60
      mycu/kernel.cu
  3. 91
      mycu/mycu.vcxproj

65
mycu/bila.cu

@ -80,11 +80,25 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i @@ -80,11 +80,25 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i
if (x >= width || y >= height)return;
int id = x + y * width;
__shared__ uchar3 box[(2 * threadX) * (2 * threadY)];
const int blocksize = (2 * r + threadX) * (2 * r + threadY);
const int id = x + y * width;
extern __shared__ uchar3 box[];// (2 * threadX)* (2 * threadY)];
int i = 0;
while(i* blockDim.x*blockDim.y+threadIdx.x+threadIdx.y*blockDim.x<blocksize)
{
int localindex = i * blockDim.x * blockDim.y + threadIdx.x + threadIdx.y * blockDim.x;
i++;
}
for(int i = 0; i< 999999;i++)
{
int localIndex = threadIdx.x + threadIdx.y * blockIdx.x + i*blockIdx.x*blockIdx.y;
int localIndex = threadIdx.x + threadIdx.y * blockDim.x + i * blockDim.x * blockDim.y;
if(localIndex<(2*r+blockDim.x)*(2*r+blockDim.y))
{
int x_temp = localIndex % (blockDim.x + 2 * r);
@ -93,7 +107,7 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i @@ -93,7 +107,7 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i
// x_temp+r+blockDim.x*(blockIdx.x+1)-1>width)
int x_real = x_temp - r + blockDim.x*blockIdx.x;
int y_real = y_temp - r + blockDim.y * blockIdx.y;
if(x_real<0||x_real>=width||y_real<0||y>=height)
if(x_real<0||x_real>=width||y_real<0||y_real>=height)
{
box[localIndex].x = 0;
box[localIndex].y = 0;
@ -101,7 +115,10 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i @@ -101,7 +115,10 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i
}
else
{
box[localIndex] = src[x_temp + y_temp * width];
box[localIndex] = src[x_real + y_real * width];
// printf("id = %d, localid = %d, dstid = %d\n", id, localIndex, x_temp + y_temp * width);
// if (blockIdx.x == 0 && blockIdx.y == 0) {
// }
}
}
else
@ -109,30 +126,38 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i @@ -109,30 +126,38 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i
break;
}
}
// __threadsync();
__syncthreads();
// printf("src[%d] = [%d,%d,%d],localindex = %d boxcenter = [%d,%d,%d]\n", id,localIndex, src[id].x, src[id].y, src[id].z, box[r+r * (2*r + blockDim.x)].x, box[r + r * (2 * r + blockDim.x)].y, box[r + r * (2 * r + blockDim.x)].z);
double coeff = 0;
double sum[3] = { 0 };
int idCenter = r + threadIdx.x + (r+threadIdx.y) * ( r*2 + blockDim.y);
for(int i = -r; i<=r;i++)
{
for(int j = -r;j<=r;j++)
{
if(x+i>=0&&x+i<width&&y+j>=0&&y+j<height)
{
int idCur = r + i + threadIdx.x + (r + j + threadIdx.y) * (r * 2 + blockDim.y);
double tmp = (
exp(-0.5 / sigmaC / sigmaC * (box[i + r + (j + r) * (2 * r + blockDim.x)].x - box[r + r * (2 * r + blockDim.x)].x)* (box[i + r + (j + r) * (2 * r + blockDim.x)].x - box[r + r * (2 * r + blockDim.x)].x)) +
exp(-0.5 / sigmaC / sigmaC * (box[i + r + (j + r) * (2 * r + blockDim.x)].x - box[r + r * (2 * r + blockDim.x)].y)* (box[i + r + (j + r) * (2 * r + blockDim.x)].x - box[r + r * (2 * r + blockDim.x)].y))+
exp(-0.5 / sigmaC / sigmaC * (box[i + r + (j + r) * (2 * r + blockDim.x)].x - box[r + r * (2 * r + blockDim.x)].z)* (box[i + r + (j + r) * (2 * r + blockDim.x)].x - box[r + r * (2 * r + blockDim.x)].z))
exp(-0.5 / sigmaC / sigmaC * (box[idCur].x - box[idCenter].x) * (box[idCur].x - box[idCenter].x)) +
exp(-0.5 / sigmaC / sigmaC * (box[idCur].y - box[idCenter].y) * (box[idCur].y - box[idCenter].y)) +
exp(-0.5 / sigmaC / sigmaC * (box[idCur].z - box[idCenter].z) * (box[idCur].z - box[idCenter].z))
) *
exp(-0.5 / sigmaD / sigmaD *
(i * i + j * j));
coeff += tmp;
sum[0] += tmp * box[i * r + (j + r) * (2 * r + blockDim.x)].x;
sum[1] += tmp * box[i * r + (j + r) * (2 * r + blockDim.x)].y;
sum[2] += tmp * box[i * r + (j + r) * (2 * r + blockDim.x)].z;
sum[0] += tmp * box[idCur].x;
sum[1] += tmp * box[idCur].y;
sum[2] += tmp * box[idCur].z;
}
}
}
// printf("sum[%d] = [%f,%f,%f] coeff = %f\n", id,sum[0], sum[1], sum[2],coeff);
if (coeff > 0) {
dst[id].x = (uchar)(sum[0] / coeff);
dst[id].y = (uchar)(sum[1] / coeff);
@ -145,10 +170,12 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i @@ -145,10 +170,12 @@ __global__ void bilakernel_v2(uchar3 * src, uchar3* dst,int width,int height, i
dst[id].z = 0;
}
if (x == 0 && y == 0)
{
printf("\n\n\n\n\n\n\n\n/////////////// on gpu /////////////////// \n\n grid dim = (%d, %d) blockdim = (%d,%d)\n\n\n\n\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);
}
// printf("dst[%d] = [%d,%d,%d]\n", id, dst[id].x, dst[id].y, dst[id].z);
// if (x == 50 && y == 50)
// {
// printf("\n\n\n\n\n\n\n\n/////////////// on gpu /////////////////// \n\n grid dim = (%d, %d) blockdim = (%d,%d)\n\n\n\n\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);
// printf("////box index///\n\n box[%d,%d] = %d,%d,%d\n\n\n", x, y, box[r * (2 * r + blockDim.x)].x, box[r * (2 * r + blockDim.x)].y, box[r * (2 * r + blockDim.x)].z);
// }
}
@ -173,8 +200,12 @@ void mycu::bilateralFilter(cv::Mat src, cv::Mat dst, int r, double sigmaC, doub @@ -173,8 +200,12 @@ void mycu::bilateralFilter(cv::Mat src, cv::Mat dst, int r, double sigmaC, doub
cudaMemcpy(srcGpu, src.data, 3 * width * height, cudaMemcpyHostToDevice);
bilakernel << <dim3((width - 1) / threadX + 1, (height - 1) / threadY + 1), dim3(threadX, threadY) >> > (srcGpu, dstGpu, width, height, r, sigmaC, sigmaS);
// bilakernel << <dim3((width - 1) / threadX + 1, (height - 1) / threadY + 1), dim3(threadX, threadY) >> > (srcGpu, dstGpu, width, height, r, sigmaC, sigmaS);
bilakernel_v2 <<<dim3((width - 1) / threadX + 1, (height - 1) / threadY + 1), dim3(threadX, threadY),(threadX+2*r)*(threadY+2*r)*4 >>> (srcGpu, dstGpu, width, height, r, sigmaC, sigmaS);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
std::cout << "err code = " << err << ", msg = " << cudaGetErrorString(err);
cudaMemcpy(dst.data, dstGpu, 3 * width * height, cudaMemcpyDeviceToHost);
cudaFree(srcGpu);

60
mycu/kernel.cu

@ -12,38 +12,41 @@ @@ -12,38 +12,41 @@
#include <opencv2/opencv.hpp>
#include "../cudaUtils/utils.h"
using namespace cv;
class hdTimer
{
public:
hdTimer()
{
QueryPerformanceFrequency(&freq);
}
void tik()
{
QueryPerformanceCounter(&start);
}
void tok()
{
QueryPerformanceCounter(&end);
}
double cost()
{
return double(end.QuadPart - start.QuadPart) / double(freq.QuadPart);
}
private:
LARGE_INTEGER start, end;
LARGE_INTEGER freq;
using namespace cv;
};
// class hdTimer
// {
// public:
// hdTimer()
// {
// QueryPerformanceFrequency(&freq);
// }
// void tik()
// {
// QueryPerformanceCounter(&start);
// }
// void tok()
// {
// QueryPerformanceCounter(&end);
// }
// double cost()
// {
// return double(end.QuadPart - start.QuadPart) / double(freq.QuadPart);
// }
//
// private:
// LARGE_INTEGER start, end;
// LARGE_INTEGER freq;
//
//
// };
@ -54,13 +57,14 @@ int main() @@ -54,13 +57,14 @@ int main()
Mat dst = Mat(src.rows, src.cols, src.type());
dst.data = (unsigned char*)malloc(3 * src.rows * src.cols);
hdTimer* t0 = new hdTimer();
::utils::hdTimer* t0 = new ::utils::hdTimer();
t0->tik();
mycu::bilateralFilter(src, dst, 4, 12, 25);
mycu::bilateralFilter(src, dst, 6, 12, 25);
t0->tok();
Mat bila;
hdTimer* t1 = new hdTimer();
::utils::hdTimer* t1 = new ::utils::hdTimer();
t1->tik();
bilateralFilter(src, bila, 9, 12, 25);
t1->tok();

91
mycu/mycu.vcxproj

@ -0,0 +1,91 @@ @@ -0,0 +1,91 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Build configurations declared by this project: Debug and Release, both for the x64 platform only. -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- Project identity: the project GUID and the root namespace (mycu). -->
<PropertyGroup Label="Globals">
<ProjectGuid>{7EC31CF5-65E6-48E8-8452-C97C8FAF891C}</ProjectGuid>
<RootNamespace>mycu</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<!-- Debug|x64 configuration: builds an application (executable) with debug libraries,
     the multi-byte character set and the v143 platform toolset. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
</PropertyGroup>
<!-- Release|x64 configuration: same target type, character set and toolset as Debug,
     but without debug libraries and with whole-program optimization enabled. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<!-- CUDA build customization, property half. This file also imports the matching
     "CUDA 11.6.targets" in its ExtensionTargets group; change both version numbers together. -->
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.6.props" />
</ImportGroup>
<!-- Per-user property sheets, one import group per configuration. Each import is guarded by
     an exists() condition, so a missing user props file is skipped rather than failing the build. -->
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<!-- Debug|x64 linker/directory settings.
     IncludePath entries must be directories. $(SolutionDir) is the folder that holds the .sln
     (it already ends with a backslash), whereas $(SolutionPath) is the full path of the .sln
     file itself and therefore cannot be used as a directory prefix.
     NOTE(review): assumes the cudaUtils folder sits next to the solution file; confirm. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>$(SolutionDir)cudaUtils;$(IncludePath)</IncludePath>
</PropertyGroup>
<!-- Debug|x64 tool defaults: unoptimized host compile at warning level 3, console subsystem,
     debug information on, static CUDA runtime (cudart_static.lib) linked in, and 64-bit
     target for the CUDA compiler. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<!-- Release|x64 tool defaults: host compile optimized for speed with function-level linking
     and intrinsics, linker COMDAT folding and reference optimization, console subsystem,
     static CUDA runtime (cudart_static.lib) linked in, and 64-bit target for the CUDA compiler.
     Debug information is still generated in this configuration. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<!-- Source files built through the CudaCompile item type. -->
<ItemGroup>
<CudaCompile Include="bila.cu" />
<CudaCompile Include="kernel.cu" />
</ItemGroup>
<!-- Header files listed in the project. -->
<ItemGroup>
<ClInclude Include="bila.cuh" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<!-- CUDA build customization, targets half. This file also imports the matching
     "CUDA 11.6.props" in its ExtensionSettings group; change both version numbers together. -->
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.6.targets" />
</ImportGroup>
</Project>
Loading…
Cancel
Save