You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

281 lines
12 KiB

// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "cmdparser.hpp"
#include "example_utils.hpp"
#include <hip/hip_runtime.h>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
/// \brief Implements the k-th (0 <= k < nodes) step of Floyd-Warshall algorithm. That is,
/// given a directed and weighted graph G = (V,E,w) (also complete in this example), it
/// computes the shortest path between every pair of vertices only considering as intermediate
/// nodes in the path the ones in the subset V' = {v_0,v_1,...,v_k} of V.
///
/// Launch layout: a 2D grid of 2D blocks in which each thread processes the single matrix
/// entry (y, x) given by its global thread index. Threads whose index falls outside the
/// nodes x nodes matrix return without touching memory.
///
/// \param part_adjacency_matrix [in,out] Row-major nodes x nodes matrix of current distances;
/// the entry at index y * nodes + x is overwritten when going through v_k is strictly shorter.
/// \param part_next_matrix [in,out] Row-major nodes x nodes path reconstruction matrix; the
/// entry at index y * nodes + x is set to k whenever the matching distance is updated.
/// \param nodes Number of vertices of the graph (side length of both matrices).
/// \param k Index of the vertex tried as intermediate node in this step.
__global__ void floyd_warshall_kernel(unsigned int*      part_adjacency_matrix,
                                      unsigned int*      part_next_matrix,
                                      const unsigned int nodes,
                                      const unsigned int k)
{
    // Compute the vertices which shortest path each thread is going to process.
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against launches whose grid covers more threads than matrix entries.
    if(x >= nodes || y >= nodes)
    {
        return;
    }

    const unsigned int row_y = y * nodes;

    // Get the current distance between the two vertices (only with intermediate nodes in
    // {v_0,v_1,...,v_{k-1}}) and compute the distance using node v_k as intermediate. Note that
    // d_x_k_y is the shortest path between x and y with node v_k as intermediate, because
    // otherwise we could find a shorter path between y and v_k or/and v_k and x using intermediate
    // nodes from {v_0,v_1,...,v_{k-1}} and thus contradicting the fact that the current paths
    // between those two pairs of nodes are already the shortest possible.
    //
    // The values are kept unsigned, exactly as they are stored, so that the comparison below
    // behaves like the one in the CPU reference implementation for every possible weight.
    const unsigned int d_x_y = part_adjacency_matrix[row_y + x];
    const unsigned int d_x_k_y
        = part_adjacency_matrix[row_y + k] + part_adjacency_matrix[k * nodes + x];

    // If the path with intermediate nodes in {v_0, ..., v_{k-1}} is longer than the one
    // with intermediate node v_k, update matrices so the latter is selected as the
    // shortest path between x and y with intermediate nodes in {v_0, ..., v_k}.
    if(d_x_k_y < d_x_y)
    {
        part_adjacency_matrix[row_y + x] = d_x_k_y;
        part_next_matrix[row_y + x]      = k;
    }
}
/// \brief Sequential host version of the Floyd-Warshall algorithm, used to validate the
/// results produced on the device.
///
/// \param adjacency_matrix [in,out] Row-major nodes x nodes distance matrix, updated in place;
/// the distance from node x to node y lives at index x * nodes + y.
/// \param next_matrix [in,out] Row-major nodes x nodes matrix; the entry for (x, y) is set to
/// the intermediate node every time a strictly shorter path through it is found.
/// \param nodes Number of nodes of the graph (side length of both matrices).
void floyd_warshall_reference(unsigned int*      adjacency_matrix,
                              unsigned int*      next_matrix,
                              const unsigned int nodes)
{
    // Try every node, in increasing order, as the newest allowed intermediate node.
    for(unsigned int via = 0; via < nodes; ++via)
    {
        const unsigned int via_row = via * nodes;
        for(unsigned int src = 0; src < nodes; ++src)
        {
            const unsigned int src_row = src * nodes;
            for(unsigned int dst = 0; dst < nodes; ++dst)
            {
                const unsigned int cell = src_row + dst;

                // Length of the path src -> via -> dst, built from the best distances known
                // so far for each of the two halves.
                const unsigned int detour
                    = adjacency_matrix[src_row + via] + adjacency_matrix[via_row + dst];

                // Keep the current path unless the detour is strictly shorter.
                if(detour >= adjacency_matrix[cell])
                {
                    continue;
                }

                adjacency_matrix[cell] = detour;
                next_matrix[cell]      = via;
            }
        }
    }
}
/// \brief Registers the options understood by this example on the given command line parser.
///
/// Two optional unsigned integer options are added: "n" / "nodes" and "i" / "iterations".
/// Their default values are validated at compile time.
///
/// \tparam BlockSize Number of threads per kernel block dimension; the default number of
/// nodes has to be a multiple of it.
/// \param parser Command line parser that receives the options.
template<unsigned int BlockSize>
void configure_parser(cli::Parser& parser)
{
    // Values used when the corresponding option is not given on the command line.
    constexpr unsigned int default_nodes      = 16;
    constexpr unsigned int default_iterations = 1;

    // Reject inconsistent defaults at compile time.
    static_assert(default_nodes % BlockSize == 0,
                  "Number of nodes must be a positive multiple of BlockSize");
    static_assert(default_iterations > 0, "Number of iterations must be at least 1");

    // Register the options on the parser.
    parser.set_optional<unsigned int>("n",
                                      "nodes",
                                      default_nodes,
                                      "Number of nodes in the graph.");
    parser.set_optional<unsigned int>("i",
                                      "iterations",
                                      default_iterations,
                                      "Number of times the algorithm is executed.");
}
int main(int argc, char* argv[])
{
// Number of threads in each kernel block dimension.
constexpr unsigned int block_size = 16;
// Parse user input.
cli::Parser parser(argc, argv);
configure_parser<block_size>(parser);
parser.run_and_exit_if_error();
// Get number of nodes and iterations from the command line, if provided.
const unsigned int nodes = parser.get<unsigned int>("n");
const unsigned int iterations = parser.get<unsigned int>("i");
// Check values provided.
if(nodes % block_size)
{
std::cout << "Number of nodes must be a positive multiple of block_size ("
<< std::to_string(block_size) << ")." << std::endl;
exit(0);
}
if(iterations == 0)
{
std::cout << "Number of iterations must be at least 1." << std::endl;
exit(0);
}
// Total number of elements and bytes of the input matrices.
const unsigned int size = nodes * nodes;
const unsigned int size_bytes = nodes * nodes * sizeof(unsigned int);
// Number of threads in each kernel block and number of blocks in the grid.
const dim3 block_dim(block_size, block_size);
const dim3 grid_dim(nodes / block_size, nodes / block_size);
// Allocate host input adjacency matrix initialized with the increasing sequence 1,2,3,... .
// Overwrite diagonal values (distance from a node to itself) to 0.
std::vector<unsigned int> adjacency_matrix(size);
std::iota(adjacency_matrix.begin(), adjacency_matrix.end(), 1);
for(unsigned int x = 0; x < nodes; x++)
{
adjacency_matrix[x * nodes + x] = 0;
}
// Allocate host input matrix for the reconstruction of the paths obtained and initialize such
// that the path from node x to node y is just the edge (x,y) for any pair of nodes x and y.
std::vector<unsigned int> next_matrix(size);
for(unsigned int x = 0; x < nodes; x++)
{
for(unsigned int y = 0; y < x; y++)
{
next_matrix[x * nodes + y] = x;
next_matrix[y * nodes + x] = y;
}
next_matrix[x * nodes + x] = x;
}
// Allocate host memory for the CPU implementation and copy input data.
std::vector<unsigned int> expected_adjacency_matrix(adjacency_matrix);
std::vector<unsigned int> expected_next_matrix(next_matrix);
// Declare host input (pinned) memory for incremental results from kernel executions.
unsigned int* part_adjacency_matrix = nullptr;
unsigned int* part_next_matrix = nullptr;
// Cumulative variable to compute the mean time per iteration of the algorithm.
double kernel_time = 0;
std::cout << "Executing Floyd-Warshall algorithm for " << iterations
<< " iterations with a complete graph of " << nodes << " nodes." << std::endl;
// Allocate pinned host memory mapped to device memory.
HIP_CHECK(hipHostMalloc(&part_adjacency_matrix, size_bytes, hipHostMallocMapped));
HIP_CHECK(hipHostMalloc(&part_next_matrix, size_bytes, hipHostMallocMapped));
// Get device pointer to pinned host memory allocations for the input matrices.
float *d_adjacency_matrix, *d_next_matrix;
HIP_CHECK(
hipHostGetDevicePointer((void**)&d_adjacency_matrix, part_adjacency_matrix, 0 /*flags*/));
HIP_CHECK(hipHostGetDevicePointer((void**)&d_next_matrix, part_next_matrix, 0 /*flags*/));
// Run iterations times the Floyd-Warshall GPU algorithm.
for(unsigned int i = 0; i < iterations; ++i)
{
// Copy input data from host to device memory.
HIP_CHECK(hipMemcpy(d_adjacency_matrix,
adjacency_matrix.data(),
size_bytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(d_next_matrix, next_matrix.data(), size_bytes, hipMemcpyHostToDevice));
// Create events to measure the execution time of the kernels.
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));
float kernel_ms{};
// Floyd-Warshall GPU algorithm: launch Floyd-Warshall kernel for each node of the graph.
for(unsigned int k = 0; k < nodes; ++k)
{
// Record the start event.
HIP_CHECK(hipEventRecord(start, hipStreamDefault));
// Launch Floyd-Warshall kernel on the default stream.
hipLaunchKernelGGL(floyd_warshall_kernel,
grid_dim,
block_dim,
0,
hipStreamDefault,
part_adjacency_matrix,
part_next_matrix,
nodes,
k);
// Check if the kernel launch was successful.
HIP_CHECK(hipGetLastError());
// Record the stop event and wait until the kernel execution finishes.
HIP_CHECK(hipEventRecord(stop, hipStreamDefault));
HIP_CHECK(hipEventSynchronize(stop));
// Get the execution time of the kernel and add it to the total count.
HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop));
kernel_time += kernel_ms;
}
}
// Copy results back to host.
HIP_CHECK(
hipMemcpy(adjacency_matrix.data(), d_adjacency_matrix, size_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(next_matrix.data(), d_next_matrix, size_bytes, hipMemcpyDeviceToHost));
// Free device memory.
HIP_CHECK(hipHostFree(part_adjacency_matrix));
HIP_CHECK(hipHostFree(part_next_matrix));
// Print the mean time per iteration (in miliseconds) of the algorithm.
kernel_time /= iterations;
std::cout << "The mean time needed for each iteration has been " << kernel_time << "ms."
<< std::endl;
// Execute CPU algorithm.
floyd_warshall_reference(expected_adjacency_matrix.data(), expected_next_matrix.data(), nodes);
// Verify results.
unsigned int errors = 0;
std::cout << "Validating results with CPU implementation." << std::endl;
for(unsigned int i = 0; i < size; ++i)
{
errors += (adjacency_matrix[i] - expected_adjacency_matrix[i] != 0);
errors += (next_matrix[i] - expected_next_matrix[i] != 0);
}
if(errors)
{
std::cout << "Validation failed with " << errors << " errors." << std::endl;
return error_exit_code;
}
else
{
std::cout << "Validation passed." << std::endl;
}
}