You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
240 lines
9.6 KiB
240 lines
9.6 KiB
// MIT License |
|
// |
|
// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. |
|
// |
|
// Permission is hereby granted, free of charge, to any person obtaining a copy |
|
// of this software and associated documentation files (the "Software"), to deal |
|
// in the Software without restriction, including without limitation the rights |
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|
// copies of the Software, and to permit persons to whom the Software is |
|
// furnished to do so, subject to the following conditions: |
|
// |
|
// The above copyright notice and this permission notice shall be included in all |
|
// copies or substantial portions of the Software. |
|
// |
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
// SOFTWARE. |
|
|
|
#include "cmdparser.hpp" |
|
#include "example_utils.hpp" |
|
|
|
#include <hip/hip_runtime.h> |
|
|
|
#include <algorithm> |
|
#include <iostream> |
|
#include <random> |
|
#include <string> |
|
#include <string_view> |
|
|
|
/// \brief Given an array of n elements, this kernel implements the j-th stage within the i-th |
|
/// step of the bitonic sort, being 0 <= i < log_2(n) and 0 <= j <= i. |
|
__global__ void bitonic_sort_kernel(unsigned int* array, |
|
const unsigned int step, |
|
const unsigned int stage, |
|
bool sort_increasing) |
|
{ |
|
// Current thread id. |
|
unsigned int thread_id = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
// How many pairs of elements are ordered with the same criteria (increasingly or decreasingly) |
|
// within each of the bitonic subsequences computed in each step. E.g. in the step 0 we have |
|
// 1 pair of elements in each monotonic component of the bitonic subsequences, that is, we |
|
// obtain bitonic sequences of length 4. |
|
const unsigned int same_order_block_width = 1 << step; |
|
|
|
// Distance between the two elements that each thread sorts. |
|
const unsigned int pair_distance = 1 << (step - stage); |
|
|
|
// Total number of elements of each subsequence processed. |
|
const unsigned int sorted_block_width = 2 * pair_distance; |
|
|
|
// Compute indexes of the elements of the array that the thread will sort. |
|
const unsigned int left_id |
|
= (thread_id % pair_distance) + (thread_id / pair_distance) * sorted_block_width; |
|
const unsigned int right_id = left_id + pair_distance; |
|
|
|
// Get the elements of the array that the thread will sort. |
|
const unsigned int left_element = array[left_id]; |
|
const unsigned int right_element = array[right_id]; |
|
|
|
// If the current thread is the first one ordering an element from the right component of the |
|
// bitonic sequence that it's computing, then the ordering criteria changes. |
|
if((thread_id / same_order_block_width) % 2 == 1) |
|
sort_increasing = !sort_increasing; |
|
|
|
// Compare elements and switch them if necessary. |
|
const unsigned int greater = (left_element > right_element) ? left_element : right_element; |
|
const unsigned int lesser = (left_element > right_element) ? right_element : left_element; |
|
array[left_id] = (sort_increasing) ? lesser : greater; |
|
array[right_id] = (sort_increasing) ? greater : lesser; |
|
} |
|
|
|
/// \brief Swaps two elements if the first is greater than the second. |
|
void swap_if_first_greater(unsigned int* a, unsigned int* b) |
|
{ |
|
if(*a > *b) |
|
{ |
|
std::swap(*a, *b); |
|
} |
|
} |
|
|
|
/// \brief Reference CPU implementation of the bitonic sort for results verification. |
|
void bitonic_sort_reference(unsigned int* array, |
|
const unsigned int length, |
|
const bool sort_increasing) |
|
{ |
|
const unsigned int half_length = length / 2; |
|
|
|
// For each step i' = log_2(i) - 1, 0 <= i' < log_2(length). |
|
for(unsigned int i = 2; i <= length; i *= 2) |
|
{ |
|
// For each stage j' = log_2(i / j), 0 <= j' <= i'. |
|
for(unsigned int j = i; j > 1; j /= 2) |
|
{ |
|
bool increasing = sort_increasing; |
|
const unsigned int half_j = j / 2; |
|
|
|
// Sort elements separated by distance j / 2. |
|
for(unsigned int k = 0; k < length; k += j) |
|
{ |
|
const unsigned int k_plus_half_j = k + half_j; |
|
|
|
// Each time we sort i elements we must change the ordering direction. |
|
if((k == i) || ((i < length) && (k % i) == 0 && (k != half_length))) |
|
{ |
|
increasing = !increasing; |
|
} |
|
|
|
// Compare and sort elements. |
|
for(unsigned int l = k; l < k_plus_half_j; ++l) |
|
{ |
|
if(increasing) |
|
{ |
|
swap_if_first_greater(&array[l], &array[l + half_j]); |
|
} |
|
else |
|
{ |
|
swap_if_first_greater(&array[l + half_j], &array[l]); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
int main(int argc, char* argv[]) |
|
{ |
|
// Parse user input. |
|
cli::Parser parser(argc, argv); |
|
parser.set_optional<unsigned int>("l", |
|
"log2length", |
|
15, |
|
"2**l will be the length of the array to be sorted."); |
|
parser.set_optional<std::string>("s", |
|
"sort", |
|
"inc", |
|
"Sort in decreasing (dec) or increasing (inc) order."); |
|
parser.run_and_exit_if_error(); |
|
|
|
const unsigned int steps = parser.get<unsigned int>("l"); |
|
|
|
const std::string sort = parser.get<std::string>("s"); |
|
if(sort.compare("dec") && sort.compare("inc")) |
|
{ |
|
std::cout << "The ordering must be 'dec' or 'inc', the default ordering is 'inc'." |
|
<< std::endl; |
|
return error_exit_code; |
|
} |
|
const bool sort_increasing = (sort.compare("inc") == 0); |
|
|
|
// Compute length of the array to be sorted. |
|
const unsigned int length = 1u << steps; |
|
|
|
// Allocate and init random host input array. Copy input array for CPU execution. |
|
std::vector<unsigned int> array(length); |
|
std::for_each(array.begin(), array.end(), [](unsigned int& e) { e = rand() % 10; }); |
|
|
|
std::vector<unsigned int> expected_array(array); |
|
|
|
std::cout << "Sorting an array of " << length << " elements using the bitonic sort." |
|
<< std::endl; |
|
|
|
// Declare and allocate device memory and copy input data. |
|
unsigned int* d_array{}; |
|
HIP_CHECK(hipMalloc(&d_array, length * sizeof(unsigned int))); |
|
HIP_CHECK( |
|
hipMemcpy(d_array, array.data(), length * sizeof(unsigned int), hipMemcpyHostToDevice)); |
|
|
|
// Number of threads in each kernel block and number of blocks in the grid. Each thread is in |
|
// charge of 2 elements, so we need enough threads to cover half the length of the array. |
|
const unsigned int local_threads = (length > 256) ? 256 : length / 2; |
|
const unsigned int global_threads = length / 2; |
|
const dim3 block_dim(local_threads); |
|
const dim3 grid_dim(global_threads / local_threads); |
|
|
|
// Create events to measure the execution time of the kernels. |
|
float total_kernels{}; |
|
float kernel_ms{}; |
|
hipEvent_t start, stop; |
|
HIP_CHECK(hipEventCreate(&start)); |
|
HIP_CHECK(hipEventCreate(&stop)); |
|
|
|
// Bitonic sort GPU algorithm: launch bitonic sort kernel for each stage of each step. |
|
for(unsigned int i = 0; i < steps; ++i) |
|
{ |
|
// For each step i we need i + 1 stages. |
|
for(unsigned int j = 0; j <= i; ++j) |
|
{ |
|
// Record the start event. |
|
HIP_CHECK(hipEventRecord(start, hipStreamDefault)); |
|
|
|
// Launch the bitonic sort kernel on the default stream. |
|
bitonic_sort_kernel<<<grid_dim, block_dim, 0 /*shared memory*/, hipStreamDefault>>>( |
|
d_array, |
|
i, |
|
j, |
|
sort_increasing); |
|
|
|
// Check if the kernel launch was successful. |
|
HIP_CHECK(hipGetLastError()); |
|
|
|
// Record the stop event and wait until the kernel execution finishes. |
|
HIP_CHECK(hipEventRecord(stop, hipStreamDefault)); |
|
HIP_CHECK(hipEventSynchronize(stop)); |
|
|
|
// Get the execution time of the kernel and add it to the total count. |
|
HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop)); |
|
total_kernels += kernel_ms; |
|
} |
|
} |
|
|
|
// Copy results back to host. |
|
HIP_CHECK( |
|
hipMemcpy(array.data(), d_array, length * sizeof(unsigned int), hipMemcpyDeviceToHost)); |
|
|
|
// Free events variables and device memory. |
|
HIP_CHECK(hipEventDestroy(start)); |
|
HIP_CHECK(hipEventDestroy(stop)); |
|
HIP_CHECK(hipFree(d_array)); |
|
|
|
// Report execution time. |
|
std::cout << "GPU bitonic sort took " << total_kernels << " milliseconds to complete." |
|
<< std::endl; |
|
|
|
// Execute CPU algorithm. |
|
bitonic_sort_reference(expected_array.data(), length, sort_increasing); |
|
|
|
// Verify results and report to user. |
|
unsigned int errors{}; |
|
std::cout << "Validating results with CPU implementation." << std::endl; |
|
for(unsigned int i = 0; i < length; ++i) |
|
{ |
|
errors += (array[i] - expected_array[i] != 0); |
|
} |
|
report_validation_result(errors); |
|
}
|
|
|