// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "cmdparser.hpp"
#include "example_utils.hpp"

#include <hip/hip_runtime.h>

#include <cassert>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

/// \brief Implements the k-th (0 <= k < nodes) step of Floyd-Warshall algorithm. That is, |
|
/// given a directed and weighted graph G = (V,E,w) (also complete in this example), it |
|
/// computes the shortest path between every pair of vertices only considering as intermediate |
|
/// nodes in the path the ones in the subset V' = {v_0,v_1,...,v_k} of V. |
|
__global__ void floyd_warshall_kernel(unsigned int* part_adjacency_matrix, |
|
unsigned int* part_next_matrix, |
|
const unsigned int nodes, |
|
const unsigned int k) |
|
{ |
|
// Compute the vertices which shortest path each thread is going to process. |
|
int x = blockIdx.x * blockDim.x + threadIdx.x; |
|
int y = blockIdx.y * blockDim.y + threadIdx.y; |
|
|
|
// Get the current distance between the two vertices (only with intermediate nodes in |
|
// {v_0,v_1,...,v_{k-1}}) and compute the distance using node v_k as intermediate. Note that |
|
// d_x_k_y is the shortest path between x and y with node v_k as intermediate, because |
|
// otherwise we could find a shorter path between y and v_k or/and v_k and x using intermediate |
|
// nodes from {v_0,v_1,...,v_{k-1}} and thus contradicting the fact that the current paths |
|
// between those two pairs of nodes are already the shortest possible. |
|
int d_x_y = part_adjacency_matrix[y * nodes + x]; |
|
int d_x_k_y = part_adjacency_matrix[y * nodes + k] + part_adjacency_matrix[k * nodes + x]; |
|
|
|
// If the path with intermediate nodes in {v_0, ..., v_{k-1}} is longer than the one |
|
// with intermediate node v_k, update matrices so the latter is selected as the |
|
// shortest path between x and y with intermediate nodes in {v_0, ..., v_k}. |
|
if(d_x_k_y < d_x_y) |
|
{ |
|
part_adjacency_matrix[y * nodes + x] = d_x_k_y; |
|
part_next_matrix[y * nodes + x] = k; |
|
} |
|
} |
|
|
|
/// \brief Reference CPU implementation of the Floyd-Warshall algorithm for results verification.
///
/// \param adjacency_matrix Row-major \p nodes x \p nodes matrix; entry (x, y) holds the distance
///        from node x to node y. On return it holds the shortest distances found.
/// \param next_matrix Row-major \p nodes x \p nodes matrix; entry (x, y) is overwritten with k
///        every time routing through node v_k shortens the path from x to y.
/// \param nodes Number of vertices of the graph (matrix dimension).
void floyd_warshall_reference(unsigned int*      adjacency_matrix,
                              unsigned int*      next_matrix,
                              const unsigned int nodes)
{
    for(unsigned int k = 0; k < nodes; k++)
    {
        // Offset of the row holding the distances from v_k to every other node.
        const unsigned int row_k = k * nodes;

        for(unsigned int x = 0; x < nodes; x++)
        {
            const unsigned int row_x = x * nodes;

            for(unsigned int y = 0; y < nodes; y++)
            {
                // d_x_y is the shortest distance from node x to node y with intermediate
                // nodes in {v_0, ..., v_{k-1}}. The other two are analogous. They are read
                // on every inner iteration because the matrix is updated in place.
                const unsigned int d_x_y = adjacency_matrix[row_x + y];
                const unsigned int d_x_k = adjacency_matrix[row_x + k];
                const unsigned int d_k_y = adjacency_matrix[row_k + y];

                // Shortest distance from node x to node y passing through node v_k.
                const unsigned int d_x_k_y = d_x_k + d_k_y;

                // If the path with intermediate nodes in {v_0, ..., v_{k-1}} is longer than the
                // one with intermediate node v_k, update matrices so the latter is selected as
                // the shortest path between x and y with intermediate nodes in {v_0, ..., v_k}.
                if(d_x_k_y < d_x_y)
                {
                    adjacency_matrix[row_x + y] = d_x_k_y;
                    next_matrix[row_x + y]      = k;
                }
            }
        }
    }
}

/// \brief Adds to a command line parser the necessary options for this example. |
|
template<unsigned int BlockSize> |
|
void configure_parser(cli::Parser& parser) |
|
{ |
|
// Default parameters. |
|
constexpr unsigned int nodes = 16; |
|
constexpr unsigned int iterations = 1; |
|
|
|
static_assert(((nodes % BlockSize == 0)), |
|
"Number of nodes must be a positive multiple of BlockSize"); |
|
static_assert(((iterations > 0)), "Number of iterations must be at least 1"); |
|
|
|
// Add options to the command line parser. |
|
parser.set_optional<unsigned int>("n", "nodes", nodes, "Number of nodes in the graph."); |
|
parser.set_optional<unsigned int>("i", |
|
"iterations", |
|
iterations, |
|
"Number of times the algorithm is executed."); |
|
} |
|
|
|
int main(int argc, char* argv[]) |
|
{ |
|
// Number of threads in each kernel block dimension. |
|
constexpr unsigned int block_size = 16; |
|
|
|
// Parse user input. |
|
cli::Parser parser(argc, argv); |
|
configure_parser<block_size>(parser); |
|
parser.run_and_exit_if_error(); |
|
|
|
// Get number of nodes and iterations from the command line, if provided. |
|
const unsigned int nodes = parser.get<unsigned int>("n"); |
|
const unsigned int iterations = parser.get<unsigned int>("i"); |
|
|
|
// Check values provided. |
|
if(nodes % block_size) |
|
{ |
|
std::cout << "Number of nodes must be a positive multiple of block_size (" |
|
<< std::to_string(block_size) << ")." << std::endl; |
|
exit(0); |
|
} |
|
if(iterations == 0) |
|
{ |
|
std::cout << "Number of iterations must be at least 1." << std::endl; |
|
exit(0); |
|
} |
|
|
|
// Total number of elements and bytes of the input matrices. |
|
const unsigned int size = nodes * nodes; |
|
const unsigned int size_bytes = nodes * nodes * sizeof(unsigned int); |
|
|
|
// Number of threads in each kernel block and number of blocks in the grid. |
|
const dim3 block_dim(block_size, block_size); |
|
const dim3 grid_dim(nodes / block_size, nodes / block_size); |
|
|
|
// Allocate host input adjacency matrix initialized with the increasing sequence 1,2,3,... . |
|
// Overwrite diagonal values (distance from a node to itself) to 0. |
|
std::vector<unsigned int> adjacency_matrix(size); |
|
std::iota(adjacency_matrix.begin(), adjacency_matrix.end(), 1); |
|
for(unsigned int x = 0; x < nodes; x++) |
|
{ |
|
adjacency_matrix[x * nodes + x] = 0; |
|
} |
|
|
|
// Allocate host input matrix for the reconstruction of the paths obtained and initialize such |
|
// that the path from node x to node y is just the edge (x,y) for any pair of nodes x and y. |
|
std::vector<unsigned int> next_matrix(size); |
|
for(unsigned int x = 0; x < nodes; x++) |
|
{ |
|
for(unsigned int y = 0; y < x; y++) |
|
{ |
|
next_matrix[x * nodes + y] = x; |
|
next_matrix[y * nodes + x] = y; |
|
} |
|
next_matrix[x * nodes + x] = x; |
|
} |
|
|
|
// Allocate host memory for the CPU implementation and copy input data. |
|
std::vector<unsigned int> expected_adjacency_matrix(adjacency_matrix); |
|
std::vector<unsigned int> expected_next_matrix(next_matrix); |
|
|
|
// Declare host input (pinned) memory for incremental results from kernel executions. |
|
unsigned int* part_adjacency_matrix = nullptr; |
|
unsigned int* part_next_matrix = nullptr; |
|
|
|
// Cumulative variable to compute the mean time per iteration of the algorithm. |
|
double kernel_time = 0; |
|
|
|
std::cout << "Executing Floyd-Warshall algorithm for " << iterations |
|
<< " iterations with a complete graph of " << nodes << " nodes." << std::endl; |
|
|
|
// Allocate pinned host memory mapped to device memory. |
|
HIP_CHECK(hipHostMalloc(&part_adjacency_matrix, size_bytes, hipHostMallocMapped)); |
|
HIP_CHECK(hipHostMalloc(&part_next_matrix, size_bytes, hipHostMallocMapped)); |
|
|
|
// Get device pointer to pinned host memory allocations for the input matrices. |
|
float *d_adjacency_matrix, *d_next_matrix; |
|
HIP_CHECK( |
|
hipHostGetDevicePointer((void**)&d_adjacency_matrix, part_adjacency_matrix, 0 /*flags*/)); |
|
HIP_CHECK(hipHostGetDevicePointer((void**)&d_next_matrix, part_next_matrix, 0 /*flags*/)); |
|
|
|
// Run iterations times the Floyd-Warshall GPU algorithm. |
|
for(unsigned int i = 0; i < iterations; ++i) |
|
{ |
|
// Copy input data from host to device memory. |
|
HIP_CHECK(hipMemcpy(d_adjacency_matrix, |
|
adjacency_matrix.data(), |
|
size_bytes, |
|
hipMemcpyHostToDevice)); |
|
HIP_CHECK(hipMemcpy(d_next_matrix, next_matrix.data(), size_bytes, hipMemcpyHostToDevice)); |
|
|
|
// Create events to measure the execution time of the kernels. |
|
hipEvent_t start, stop; |
|
HIP_CHECK(hipEventCreate(&start)); |
|
HIP_CHECK(hipEventCreate(&stop)); |
|
float kernel_ms{}; |
|
|
|
// Floyd-Warshall GPU algorithm: launch Floyd-Warshall kernel for each node of the graph. |
|
for(unsigned int k = 0; k < nodes; ++k) |
|
{ |
|
// Record the start event. |
|
HIP_CHECK(hipEventRecord(start, hipStreamDefault)); |
|
|
|
// Launch Floyd-Warshall kernel on the default stream. |
|
hipLaunchKernelGGL(floyd_warshall_kernel, |
|
grid_dim, |
|
block_dim, |
|
0, |
|
hipStreamDefault, |
|
part_adjacency_matrix, |
|
part_next_matrix, |
|
nodes, |
|
k); |
|
|
|
// Check if the kernel launch was successful. |
|
HIP_CHECK(hipGetLastError()); |
|
|
|
// Record the stop event and wait until the kernel execution finishes. |
|
HIP_CHECK(hipEventRecord(stop, hipStreamDefault)); |
|
HIP_CHECK(hipEventSynchronize(stop)); |
|
|
|
// Get the execution time of the kernel and add it to the total count. |
|
HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop)); |
|
kernel_time += kernel_ms; |
|
} |
|
} |
|
|
|
// Copy results back to host. |
|
HIP_CHECK( |
|
hipMemcpy(adjacency_matrix.data(), d_adjacency_matrix, size_bytes, hipMemcpyDeviceToHost)); |
|
HIP_CHECK(hipMemcpy(next_matrix.data(), d_next_matrix, size_bytes, hipMemcpyDeviceToHost)); |
|
|
|
// Free device memory. |
|
HIP_CHECK(hipHostFree(part_adjacency_matrix)); |
|
HIP_CHECK(hipHostFree(part_next_matrix)); |
|
|
|
// Print the mean time per iteration (in miliseconds) of the algorithm. |
|
kernel_time /= iterations; |
|
std::cout << "The mean time needed for each iteration has been " << kernel_time << "ms." |
|
<< std::endl; |
|
|
|
// Execute CPU algorithm. |
|
floyd_warshall_reference(expected_adjacency_matrix.data(), expected_next_matrix.data(), nodes); |
|
|
|
// Verify results. |
|
unsigned int errors = 0; |
|
std::cout << "Validating results with CPU implementation." << std::endl; |
|
for(unsigned int i = 0; i < size; ++i) |
|
{ |
|
errors += (adjacency_matrix[i] - expected_adjacency_matrix[i] != 0); |
|
errors += (next_matrix[i] - expected_next_matrix[i] != 0); |
|
} |
|
|
|
if(errors) |
|
{ |
|
std::cout << "Validation failed with " << errors << " errors." << std::endl; |
|
return error_exit_code; |
|
} |
|
else |
|
{ |
|
std::cout << "Validation passed." << std::endl; |
|
} |
|
}
|
|
|