You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
134 lines
4.7 KiB
134 lines
4.7 KiB
|
|
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>

#include <opencv2/opencv.hpp>
|
|
|
|
|
// Converts a normalized channel value to an 8-bit value: values above 1 saturate to 255,
// values that are not positive (including NaN) map to 0, everything else is scaled by 255
// and truncated.
static __device__ __forceinline__ unsigned char unitToByte(double v)
{
    if (!(v > 0.0))
        return 0;
    if (v > 1.0)
        return 255;
    return (unsigned char)(v * 255.0);
}

/*
 * kernelR - in-place recursive edge-aware smoothing of a 3-channel double image, followed by
 * conversion of the result to 8-bit output.
 *
 * Launch layout: exactly ONE thread block of any shape. Every phase is separated by
 * __syncthreads(), which only synchronizes threads of the same block, so additional blocks
 * return immediately. Rows and columns are distributed over the block's threads with
 * block-stride loops, so the image may have more rows/columns than the block has threads.
 *
 * Parameters:
 *   src        height*width pixels, row-major; read and OVERWRITTEN with the filtered image.
 *   dst        height*width pixels, row-major; receives each src channel scaled by 255.
 *   maskhALL   scratch, height*width doubles; column c uses [c*height, (c+1)*height).
 *   maskvALL   scratch, height*width doubles; row r uses [r*width, (r+1)*width).
 *   height     number of image rows.
 *   width      number of image columns.
 *   sigma_s,
 *   sigma_r    filter parameters; their ratio scales the neighbour colour differences.
 *   no_of_iter number of horizontal+vertical filtering iterations.
 *
 * mask[k] holds 1 + (sigma_s / sigma_r) * L1 colour difference between sample k and k+1,
 * so the left-to-right (top-to-bottom) pass at sample j uses mask[j - 1] and the
 * right-to-left (bottom-to-top) pass at sample j uses mask[j].
 */
__global__ void kernelR(double3* src, uchar3* dst, double* maskhALL, double* maskvALL, int height, int width, float sigma_s, float sigma_r, int no_of_iter)
{
    // Uniform early exits (taken by whole blocks), so no thread is left waiting at a barrier.
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0)
        return;
    if (height <= 0 || width <= 0)
        return;

    const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x * blockDim.y * blockDim.z;
    const float ratio = sigma_s / sigma_r;

    // Per-row masks: difference between horizontally adjacent pixels.
    for (int row = tid; row < height; row += nthreads)
    {
        const double3* line = src + (size_t)row * width;
        double* maskv = maskvALL + (size_t)row * width;

        for (int i = 0; i < width - 1; i++)
            maskv[i] = 1 + ratio * (fabs(line[i + 1].x - line[i].x) +
                                    fabs(line[i + 1].y - line[i].y) +
                                    fabs(line[i + 1].z - line[i].z));
        maskv[width - 1] = 1;
    }

    // Per-column masks: difference between vertically adjacent pixels.
    for (int col = tid; col < width; col += nthreads)
    {
        double* maskh = maskhALL + (size_t)col * height;

        for (int j = 0; j < height - 1; j++)
        {
            const double3 upper = src[(size_t)j * width + col];
            const double3 lower = src[(size_t)(j + 1) * width + col];
            maskh[j] = 1 + ratio * (fabs(lower.x - upper.x) +
                                    fabs(lower.y - upper.y) +
                                    fabs(lower.z - upper.z));
        }
        maskh[height - 1] = 1;
    }

    // The masks must be built from the unfiltered image: no thread may start overwriting
    // src until every thread has finished reading it.
    __syncthreads();

    for (int i = 0; i < no_of_iter; i++)
    {
        float sigma_h = sigma_s * sqrt(3.0) * pow(2.0, (double)(no_of_iter - (i + 1))) / sqrt(pow(4.0, (double)no_of_iter) - 1);
        float a = (float)exp((-1.0 * sqrt(2.0)) / sigma_h);

        // Horizontal pass. Each row is handled start to finish by one thread, so the
        // forward and backward sweeps need no barrier between them.
        for (int row = tid; row < height; row += nthreads)
        {
            double3* line = src + (size_t)row * width;
            const double* maskv = maskvALL + (size_t)row * width;

            for (int j = 1; j < width; j++)
            {
                const double w = pow((double)a, maskv[j - 1]);
                line[j].x += (line[j - 1].x - line[j].x) * w;
                line[j].y += (line[j - 1].y - line[j].y) * w;
                line[j].z += (line[j - 1].z - line[j].z) * w;
            }
            for (int j = width - 2; j >= 0; j--)
            {
                const double w = pow((double)a, maskv[j]);
                line[j].x += (line[j + 1].x - line[j].x) * w;
                line[j].y += (line[j + 1].y - line[j].y) * w;
                line[j].z += (line[j + 1].z - line[j].z) * w;
            }
        }

        // Columns read pixels written by other threads' rows.
        __syncthreads();

        // Vertical pass, one column per thread.
        for (int col = tid; col < width; col += nthreads)
        {
            const double* maskh = maskhALL + (size_t)col * height;

            for (int j = 1; j < height; j++)
            {
                const double w = pow((double)a, maskh[j - 1]);
                double3* cur = src + (size_t)j * width + col;
                const double3* prev = cur - width;
                cur->x += (prev->x - cur->x) * w;
                cur->y += (prev->y - cur->y) * w;
                cur->z += (prev->z - cur->z) * w;
            }
            for (int j = height - 2; j >= 0; j--)
            {
                const double w = pow((double)a, maskh[j]);
                double3* cur = src + (size_t)j * width + col;
                const double3* next = cur + width;
                cur->x += (next->x - cur->x) * w;
                cur->y += (next->y - cur->y) * w;
                cur->z += (next->z - cur->z) * w;
            }
        }

        // The next horizontal pass (or the final conversion) reads other threads' columns.
        __syncthreads();
    }

    // Convert the filtered image to 8 bits per channel.
    for (int row = tid; row < height; row += nthreads)
    {
        const double3* line = src + (size_t)row * width;
        uchar3* out = dst + (size_t)row * width;

        for (int j = 0; j < width; j++)
        {
            out[j].x = unitToByte(line[j].x);
            out[j].y = unitToByte(line[j].y);
            out[j].z = unitToByte(line[j].z);
        }
    }
}
|
|
|
|
|
|
|
// Runs kernelR on a test image, runs cv::edgePreservingFilter on the same image, prints the
// wall-clock time of each, and displays both results together with their differences.
// Returns 0 on success, 1 if the image cannot be loaded; exits with EXIT_FAILURE on any
// CUDA runtime error.
int main()
{
    // Stop with a readable message as soon as a CUDA runtime call fails; an ignored error
    // would otherwise make every later call fail in confusing ways.
    auto check = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    cv::Mat image = cv::imread("D:/opencv/modules/core/misc/objc/test/resources/lena.png");
    if (image.empty())
    {
        fprintf(stderr, "could not load the input image\n");
        return 1;
    }

    cv::Mat dst = cv::Mat(image.rows, image.cols, CV_8UC3);

    const size_t pixels = (size_t)image.rows * (size_t)image.cols;
    const size_t srcBytes = pixels * sizeof(double3);
    const size_t dstBytes = pixels * sizeof(uchar3);
    const size_t maskBytes = pixels * sizeof(double);

    double3* src_GPU = nullptr;
    uchar3* dst_GPU = nullptr;
    double* maskv = nullptr;
    double* maskh = nullptr;

    check(cudaMalloc(&src_GPU, srcBytes), "cudaMalloc(src)");
    check(cudaMalloc(&dst_GPU, dstBytes), "cudaMalloc(dst)");
    check(cudaMalloc(&maskv, maskBytes), "cudaMalloc(maskv)");
    check(cudaMalloc(&maskh, maskBytes), "cudaMalloc(maskh)");

    // steady_clock is used for both the variables and now(): high_resolution_clock is not
    // guaranteed to be the same type as steady_clock.
    std::chrono::time_point<std::chrono::steady_clock> start, end;

    start = std::chrono::steady_clock::now();
    cv::Mat temp;
    image.convertTo(temp, CV_64FC3, 1.0 / 255.0);
    check(cudaMemcpy(src_GPU, temp.data, srcBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");
    // Single block: the kernel synchronizes its phases with block-level barriers.
    kernelR<<<1, dim3(32, 32)>>>(src_GPU, dst_GPU, maskh, maskv, image.rows, image.cols, 43, 0.7, 3);
    check(cudaGetLastError(), "kernelR launch");
    check(cudaDeviceSynchronize(), "kernelR execution");
    check(cudaMemcpy(dst.data, dst_GPU, dstBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    end = std::chrono::steady_clock::now();
    std::cout << "gpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;

    start = std::chrono::steady_clock::now();
    cv::Mat edge;
    cv::edgePreservingFilter(image, edge, 1, 43, 0.7);
    end = std::chrono::steady_clock::now();
    std::cout << "cpu cost " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << " ns " << std::endl;

    // The results are already on the host; release the device buffers.
    check(cudaFree(src_GPU), "cudaFree(src)");
    check(cudaFree(dst_GPU), "cudaFree(dst)");
    check(cudaFree(maskv), "cudaFree(maskv)");
    check(cudaFree(maskh), "cudaFree(maskh)");

    cv::imshow("cpu", edge);
    cv::imshow("gpu ", dst);
    cv::imshow("sub", dst - edge);
    cv::imshow("sub2", edge - dst);

    cv::waitKey(0);
    return 0;
}