From 6329974d4dd23f736f5cea6145ecc245d7e53940 Mon Sep 17 00:00:00 2001 From: Troels Ynddal Date: Thu, 19 Jun 2025 15:38:53 +0200 Subject: [PATCH] Merge pull request #3943 from troelsy:4.x Add Otsu's method to cv::cuda::threshold #3943 I implemented Otsu's method in CUDA for a separate project and want to add it to cv::cuda::threshold I have made an effort to use existing OpenCV functions in my code, but I had some trouble with `ThresholdTypes` and `cv::cuda::calcHist`. I couldn't figure out how to include `precomp.hpp` to get the definition of `ThresholdTypes`. For `cv::cuda::calcHist` I tried adding `opencv_cudaimgproc`, but it creates a circular dependency on `cudaarithm`. I have include a simple implementation of `calcHist` so the code runs, but I would like input on how to use `cv::cuda::calcHist` instead. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- .../cudaarithm/include/opencv2/cudaarithm.hpp | 10 +- modules/cudaarithm/src/cuda/threshold.cu | 222 +++++++++++++++++- .../test/test_element_operations.cpp | 50 +++- .../opencv2/cudev/util/saturate_cast.hpp | 2 + .../include/opencv2/cudev/warp/shuffle.hpp | 10 + 5 files changed, 289 insertions(+), 5 deletions(-) diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp index e5ab2cf25..80b4dd8fa 100644 --- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp +++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp @@ -546,12 +546,16 @@ static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, Outp /** @brief Applies a fixed-level threshold to each array element. +The special value cv::THRESH_OTSU may be combined with one of the other types. In this case, the function determines the +optimal threshold value using the Otsu's and uses it instead of the specified threshold. The function returns the +computed threshold value in addititon to the thresholded matrix. +The Otsu's method is implemented only for 8-bit matrices. + @param src Source array (single-channel). -@param dst Destination array with the same size and type as src . +@param dst Destination array with the same size and type as src. @param thresh Threshold value. @param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types. -@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE -threshold types are not supported. +@param type Threshold type. For details, see threshold. The THRESH_TRIANGLE threshold type is not supported. @param stream Stream for the asynchronous version. @sa threshold diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index cfa88dfae..c5586e61a 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -95,12 +95,232 @@ namespace } } -double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream) + +__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums) +{ + const uint32_t n_bins = 256; + + __shared__ uint shared_memory_ts[n_bins]; + __shared__ unsigned long long shared_memory_s[n_bins]; + + int bin_idx = threadIdx.x; + int threshold = blockIdx.x; + + uint threshold_sum_above = 0; + unsigned long long sum_above = 0; + + if (bin_idx >= threshold) + { + uint value = histogram[bin_idx]; + threshold_sum_above = value; + sum_above = value * bin_idx; + } + + blockReduce(shared_memory_ts, threshold_sum_above, bin_idx, plus()); + blockReduce(shared_memory_s, sum_above, bin_idx, plus()); + + if (bin_idx == 0) + { + threshold_sums[threshold] = threshold_sum_above; + sums[threshold] = sum_above; + } +} + +__global__ void +otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) +{ + const uint32_t n_bins = 256; + + __shared__ signed long long shared_memory_a[n_bins]; + __shared__ signed long long shared_memory_b[n_bins]; + + int bin_idx = threadIdx.x; + int threshold = blockIdx.x; + + uint n_samples = threshold_sums[0]; + uint n_samples_above = threshold_sums[threshold]; + uint n_samples_below = n_samples - n_samples_above; + + unsigned long long total_sum = sums[0]; + unsigned long long sum_above = sums[threshold]; + unsigned long long sum_below = total_sum - sum_above; + + float threshold_variance_above_f32 = 0; + float threshold_variance_below_f32 = 0; + if (bin_idx >= threshold) + { + float mean = (float) sum_above / n_samples_above; + float sigma = bin_idx - mean; + threshold_variance_above_f32 = sigma * sigma; + } + else + { + float mean = (float) sum_below / n_samples_below; + float sigma = bin_idx - mean; + threshold_variance_below_f32 = sigma * sigma; + } + + uint bin_count = histogram[bin_idx]; + signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count); + signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count); + blockReduce(shared_memory_a, threshold_variance_above_i64, bin_idx, plus()); + blockReduce(shared_memory_b, threshold_variance_below_i64, bin_idx, plus()); + + if (bin_idx == 0) + { + variance[threshold] = make_float2(threshold_variance_above_i64, threshold_variance_below_i64); + } +} + + +__global__ void +otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) +{ + const uint32_t n_thresholds = 256; + + __shared__ float shared_memory[n_thresholds / WARP_SIZE]; + + int threshold = threadIdx.x; + + uint n_samples = threshold_sums[0]; + uint n_samples_above = threshold_sums[threshold]; + uint n_samples_below = n_samples - n_samples_above; + + float threshold_mean_above = (float)n_samples_above / n_samples; + float threshold_mean_below = (float)n_samples_below / n_samples; + + float2 variances = variance[threshold]; + float variance_above = variances.x / n_samples_above; + float variance_below = variances.y / n_samples_below; + + float above = threshold_mean_above * variance_above; + float below = threshold_mean_below * variance_below; + float score = above + below; + + float original_score = score; + + blockReduce(shared_memory, score, threshold, minimum()); + + if (threshold == 0) + { + shared_memory[0] = score; + } + __syncthreads(); + + score = shared_memory[0]; + + // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we + // know which threshold it is + if (original_score == score) + { + *otsu_threshold = threshold - 1; + } +} + +void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream) +{ + const uint n_bins = 256; + const uint n_thresholds = 256; + + cudaStream_t cuda_stream = StreamAccessor::getStream(stream); + + dim3 block_all(n_bins); + dim3 grid_all(n_thresholds); + dim3 block_score(n_thresholds); + dim3 grid_score(1); + + BufferPool pool(stream); + GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator()); + GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator()); + GpuMat gpu_variances(1, n_bins, CV_32FC2, pool.getAllocator()); + + otsu_sums<<>>( + histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_variance<<>>( + gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_score<<>>( + otsu_threshold, gpu_threshold_sums.ptr(), gpu_variances.ptr()); +} + +// TODO: Replace this is cv::cuda::calcHist +template +__global__ void histogram_kernel( + uint *histogram, const uint8_t *image, uint width, + uint height, uint pitch) +{ + __shared__ uint local_histogram[n_bins]; + + uint x = blockIdx.x * blockDim.x + threadIdx.x; + uint y = blockIdx.y * blockDim.y + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < n_bins) + { + local_histogram[tid] = 0; + } + + __syncthreads(); + + if (x < width && y < height) + { + uint8_t value = image[y * pitch + x]; + atomicInc(&local_histogram[value], 0xFFFFFFFF); + } + + __syncthreads(); + + if (tid < n_bins) + { + cv::cudev::atomicAdd(&histogram[tid], local_histogram[tid]); + } +} + +// TODO: Replace this with cv::cuda::calcHist +void calcHist( + const GpuMat src, GpuMat histogram, Stream stream) +{ + const uint n_bins = 256; + + cudaStream_t cuda_stream = StreamAccessor::getStream(stream); + + dim3 block(128, 4, 1); + dim3 grid = dim3(divUp(src.cols, block.x), divUp(src.rows, block.y), 1); + CV_CUDEV_SAFE_CALL(cudaMemsetAsync(histogram.ptr(), 0, n_bins * sizeof(uint), cuda_stream)); + histogram_kernel + <<>>( + histogram.ptr(), src.ptr(), (uint) src.cols, (uint) src.rows, (uint) src.step); +} + +double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream) { GpuMat src = getInputMat(_src, stream); const int depth = src.depth(); + const int THRESH_OTSU = 8; + if ((type & THRESH_OTSU) == THRESH_OTSU) + { + CV_Assert(depth == CV_8U); + CV_Assert(src.channels() == 1); + + BufferPool pool(stream); + + // Find the threshold using Otsu and then run the normal thresholding algorithm + GpuMat gpu_histogram(256, 1, CV_32SC1, pool.getAllocator()); + calcHist(src, gpu_histogram, stream); + + GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator()); + compute_otsu(gpu_histogram.ptr(), gpu_otsu_threshold.ptr(), stream); + + cv::Mat mat_otsu_threshold; + gpu_otsu_threshold.download(mat_otsu_threshold, stream); + stream.waitForCompletion(); + + // Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type + type = type & ~THRESH_OTSU; + thresh = (double) mat_otsu_threshold.at(0); + } + CV_Assert( depth <= CV_64F ); CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ ); diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index a15ad7b3e..f3ce3b520 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -2529,7 +2529,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, AddWeighted, testing::Combine( /////////////////////////////////////////////////////////////////////////////////////////////////////// // Threshold -CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV) +CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV, cv::THRESH_OTSU) #define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV)) PARAM_TEST_CASE(Threshold, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi) @@ -2577,6 +2577,54 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Threshold, testing::Combine( ALL_THRESH_OPS, WHOLE_SUBMAT)); +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// ThresholdOtsu + +PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + int type; + int channel; + int threshOp; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2); + channel = GET_PARAM(3); + threshOp = GET_PARAM(4) | cv::THRESH_OTSU; + useRoi = GET_PARAM(5); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(ThresholdOtsu, Accuracy) +{ + cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel)); + + cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi); + double otsu_gpu = cv::cuda::threshold(loadMat(src, useRoi), dst, 0, 255, threshOp); + + cv::Mat dst_gold; + double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp); + + ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatDepth(CV_8U)), + testing::Values(Channels(1)), + ALL_THRESH_OPS, + WHOLE_SUBMAT)); + + //////////////////////////////////////////////////////////////////////////////// // InRange diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp index 59fd8da45..c256e7d90 100644 --- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp +++ b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp @@ -62,6 +62,8 @@ template __device__ __forceinline__ T saturate_cast(ushort v) { ret template __device__ __forceinline__ T saturate_cast(short v) { return T(v); } template __device__ __forceinline__ T saturate_cast(uint v) { return T(v); } template __device__ __forceinline__ T saturate_cast(int v) { return T(v); } +template __device__ __forceinline__ T saturate_cast(signed long long v) { return T(v); } +template __device__ __forceinline__ T saturate_cast(unsigned long long v) { return T(v); } template __device__ __forceinline__ T saturate_cast(float v) { return T(v); } template __device__ __forceinline__ T saturate_cast(double v) { return T(v); } diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp index f00d1f850..0de5351ff 100644 --- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp @@ -332,6 +332,16 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp return (uint) __shfl_down((int) val, delta, width); } +__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize) +{ + return __shfl_down(val, delta, width); +} + +__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize) +{ + return (unsigned long long) __shfl_down(val, delta, width); +} + __device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize) { return __shfl_down(val, delta, width);