
Develop Stream 2024-03-21 general fixes (part II) (#98)

* Re-enable the hello-world example on Windows with CMake

* Update ROCm to 5.7

* Small changes to make the pipeline pass with rocm 5.7 on Nvidia platform

* Re-enable the rocsparse tests for cmake on windows

* Resolve "windows test timeouts"

* Resolve "hip_multi_gpu_data_transfer fails validation."

* Remove unused includes

* Resolve "NVCC VS 2019 Release runtime failure"

* Resolve "NVCC: hip_runtime_compilation failure"

* Resolve "Update install instruction to reflect optional dependencies of certain libraries"

* Resolve "HIP texture management example not working on Windows"

* Print version

* Install ROCm on nvidia via rocm-core and hipcc

* Resolve "Use ceiling_div wherever possible"

* Resolve "Investigate and replace setprecision with a better solution in examples"

* Disable execution of failing examples in VS

* fix(rocsparse_utils): remove rocSPARSE 3.0 status

---------

Co-authored-by: Gergely Mészáros <gergely@streamhpc.com>
Co-authored-by: Mátyás Aradi <matyas@streamhpc.com>
Co-authored-by: Balint Soproni <balint@streamhpc.com>
Co-authored-by: Bence Parajdi <bence@streamhpc.com>
Co-authored-by: Robin Voetter <robin@streamhpc.com>
Co-authored-by: Jaap Blok <jaap@streamhpc.com>
Co-authored-by: Nick Breed <nick@streamhpc.com>
Authored by Beatriz Navidad Vilches · committed by GitHub · commit d0ce9531cb
28 changed files:

1. .clang-format (2)
2. .gitlab-ci.yml (35)
3. Applications/prefix_sum/main.hip (16)
4. Common/example_utils.hpp (19)
5. Common/rocsparse_utils.hpp (3)
6. Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile (68)
7. Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile (6)
8. HIP-Basic/CMakeLists.txt (8)
9. HIP-Basic/device_globals/main.hip (2)
10. HIP-Basic/device_query/main.cpp (14)
11. HIP-Basic/hello_world/CMakeLists.txt (4)
12. HIP-Basic/module_api/main.hip (4)
13. HIP-Basic/moving_average/main.hip (2)
14. HIP-Basic/multi_gpu_data_transfer/README.md (1)
15. HIP-Basic/multi_gpu_data_transfer/main.hip (74)
16. HIP-Basic/occupancy/main.hip (15)
17. HIP-Basic/opengl_interop/README.md (36)
18. HIP-Basic/opengl_interop/main.hip (11)
19. HIP-Basic/opengl_interop/nvidia_hip_fix.hpp (26)
20. HIP-Basic/runtime_compilation/main.hip (9)
21. HIP-Basic/saxpy/main.hip (2)
22. HIP-Basic/shared_memory/main.hip (2)
23. HIP-Basic/static_device_library/main.hip (2)
24. HIP-Basic/static_host_library/library/library.hip (2)
25. HIP-Basic/vulkan_interop/README.md (58)
26. HIP-Basic/vulkan_interop/main.hip (24)
27. Libraries/rocPRIM/block_sum/main.hip (2)
28. Scripts/WindowsRunner.ps1 (2)

2
.clang-format

@@ -58,8 +58,6 @@ BraceWrapping:
AfterNamespace: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
AfterExternBlock: false
BeforeCatch: true
BeforeElse: true

35
.gitlab-ci.yml

@@ -239,9 +239,11 @@ test:cuda:
- Debug
- Release
variables:
Timeout: 30
Timeout: 60
Filter: "*_vs$VS_VERSION.exe"
script:
- |
& ${env:HIP_PATH}/bin/clang++ --version
- | # Find MSBuild.exe of the associated version.
$MSBUILD = (
& "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -find MSBuild\**\Bin\MSBuild.exe
@@ -271,11 +273,7 @@ test:cuda:
"$CI_PROJECT_DIR\$SOLUTION_PREFIX$VS_VERSION.sln"
)
- | # Use external script to test examples
if (!$SKIP_TESTS) {
& $CI_PROJECT_DIR\Scripts\WindowsRunner.ps1 $CI_PROJECT_DIR\$BUILD_TYPE $Filter $Timeout $("$SkippedExamples".split(','))
} else {
Write-Output "Tests skipped!"
}
& $CI_PROJECT_DIR\Scripts\WindowsRunner.ps1 $CI_PROJECT_DIR\$BUILD_TYPE $Filter $Timeout $("$SkippedExamples".split(','))
test:windows-rocm-vs:
extends:
@@ -287,13 +285,20 @@ test:windows-rocm-vs:
- rx6900
variables:
SOLUTION_PREFIX: ROCm-Examples-VS
# hip_vulkant_interop: graphical
# hip_texture_management: does not work
# rocsparse_*: broken with new SDK
# hip_vulkan_interop: graphical
# applications_monte_carlo_pi: broken with new SDK (5.7 v66)
# hipfft/rocfft_*: broken with new SDK (5.7 v66)
SkippedExamples: >
hip_vulkan_interop_*.exe,
hip_texture_management_*.exe,
applications_monte_carlo_pi_*.exe,
hipfft_plan_d2z_*.exe,
hipfft_plan_z2z_*.exe,
hipfft_plan_many_2d_r2c_*.exe,
hipfft_plan_many_2d_z2z_*.exe,
hipfft_setworkarea_*.exe,
rocfft_complex_complex_*.exe,
rocfft_complex_real_*.exe,
rocfft_real_complex_*.exe
test:windows-nvcc-vs:
extends:
- .test:windows-nvcc
@@ -302,12 +307,10 @@ test:windows-nvcc-vs:
- nvcc-windows
variables:
SOLUTION_PREFIX: ROCm-Examples-Portable-VS
# hip_runtime_compilation: fails on VS2017
# NVCC examples broken with new SDK (5.7 v66)
SkippedExamples: >
hip_runtime_compilation_vs2017.exe
*.exe
before_script:
- | # Release builds are currently broken!
$SKIP_TESTS = ($BUILD_TYPE -eq "Release")
# To test for NVIDIA, we need to set the platform toolset to HIP_nvcc. This cannot be done with /p:PlatformToolset
# though, as some examples use the regular msvc toolchain.
- |
@@ -347,7 +350,7 @@ test:windows-nvcc-vs:
# So for now, just add the library path here.
- $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH
- cd "$CI_PROJECT_DIR/build"
- ctest --output-on-failure --timeout 15 -E "rocsparse_bsrsv|rocsparse_csrsv|rocsparse_spsv|rocsparse_bsrsm|rocsparse_csrsm|rocsparse_bsric0|rocsparse_bsrilu0|rocsparse_csric0|rocsparse_csrilu0"
- ctest --output-on-failure --timeout 15
- cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
needs: []

16
Applications/prefix_sum/main.hip

@@ -123,11 +123,11 @@ __global__ void device_prefix_sum(float* buffer, int size, int offset)
void run_prefix_sum_kernels(float* input, float* output, const int size)
{
// 4.1 Define kernel constants
constexpr int threads_per_block = 128;
dim3 block_dim(threads_per_block);
constexpr unsigned int threads_per_block = 128;
dim3 block_dim(threads_per_block);
// Each thread works on 2 elements.
constexpr int items_per_block = threads_per_block * 2;
constexpr unsigned int items_per_block = threads_per_block * 2;
// block_prefix_sum uses shared memory dependent on the amount of threads per block.
constexpr size_t shared_size = sizeof(float) * 2 * threads_per_block;
@@ -142,13 +142,12 @@ void run_prefix_sum_kernels(float* input, float* output, const int size)
// Alternatively, use hipcub::DeviceScan::ExclusiveScan
for(int offset = 1; offset < size; offset *= items_per_block)
{
const int data_size = size / offset;
const unsigned int data_size = size / offset;
if(size / offset > 1)
{
unsigned int total_threads = (data_size + 1) / 2;
total_threads
= ((total_threads + threads_per_block - 1) / threads_per_block) * threads_per_block;
total_threads = ceiling_div(total_threads, threads_per_block) * threads_per_block;
dim3 grid_dim(total_threads / threads_per_block);
block_prefix_sum<<<grid_dim, block_dim, shared_size>>>(d_data, size, offset);
@@ -156,10 +155,9 @@ void run_prefix_sum_kernels(float* input, float* output, const int size)
if(offset > 1)
{
int total_threads = size - offset;
unsigned int total_threads = size - offset;
total_threads -= (total_threads / (offset * items_per_block)) * offset;
total_threads
= ((total_threads + threads_per_block - 1) / threads_per_block) * threads_per_block;
total_threads = ceiling_div(total_threads, threads_per_block) * threads_per_block;
dim3 grid_dim(total_threads / threads_per_block);
device_prefix_sum<<<grid_dim, block_dim>>>(d_data, size, offset);

19
Common/example_utils.hpp

@@ -24,6 +24,7 @@
#define COMMON_EXAMPLE_UTILS_HPP
// Compiling HIP on Windows includes windows.h, and this triggers many silly warnings.
#include <cstdint>
#if defined(_WIN32) && defined(__NVCC__)
#pragma nv_diag_suppress 108 // signed bit field of length 1
#pragma nv_diag_suppress 174 // expression has no effect
@@ -35,13 +36,16 @@
#pragma clang diagnostic ignored "-W#warnings"
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
#include <hip/hip_runtime.h>
@@ -177,7 +181,7 @@ public:
template<typename T,
typename U,
std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<U>::value, int> = 0>
__host__ __device__ auto ceiling_div(const T& dividend, const U& divisor)
__host__ __device__ constexpr auto ceiling_div(const T& dividend, const U& divisor)
{
return (dividend + divisor - 1) / divisor;
}
@@ -240,4 +244,17 @@ void multiply_matrices(T alpha,
}
}
/// \brief Returns a string from the double \p value with specified \p precision .
inline std::string
double_precision(const double value, const int precision, const bool fixed = false)
{
std::stringstream ss;
if(fixed)
{
ss << std::fixed;
}
ss << std::setprecision(precision) << value;
return ss.str();
}
#endif // COMMON_EXAMPLE_UTILS_HPP
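As a quick illustration of the two helpers touched above (the now-`constexpr` `ceiling_div` and the new `double_precision`), here is a hypothetical usage sketch, assuming `Common/` is on the include path:

```cpp
#include "example_utils.hpp"

#include <iostream>

int main()
{
    constexpr unsigned int size       = 1000;
    constexpr unsigned int block_size = 128;
    // ceiling_div is now constexpr, so grid sizes can be computed at compile time,
    // mirroring the grid_size changes in the examples below.
    constexpr unsigned int grid_size = ceiling_div(size, block_size); // 8

    // double_precision replaces the scattered std::setprecision calls.
    std::cout << "grid_size: " << grid_size << '\n';
    std::cout << "occupancy: " << double_precision(98.76543, 2, true) << "%\n"; // prints "98.77%"
    return 0;
}
```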

3
Common/rocsparse_utils.hpp

@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@@ -47,7 +47,6 @@ inline const char* rocsparse_status_to_string(rocsparse_status status)
case rocsparse_status_not_initialized: return "rocsparse_status_not_initialized";
case rocsparse_status_type_mismatch: return "rocsparse_status_type_mismatch";
case rocsparse_status_thrown_exception: return "rocsparse_status_thrown_exception";
case rocsparse_status_continue: return "rocsparse_status_continue";
case rocsparse_status_requires_sorted_storage:
return "rocsparse_status_requires_sorted_storage";
}

68
Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile

@@ -27,11 +27,9 @@ RUN export DEBIAN_FRONTEND=noninteractive; \
# Install HIP using the installer script
RUN export DEBIAN_FRONTEND=noninteractive; \
wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
&& echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.4/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
&& echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.7/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
&& apt-get update -qq \
&& apt-get install -y hip-base hipify-clang \
&& apt-get download hip-runtime-nvidia hip-dev \
&& dpkg -i --ignore-depends=cuda hip*
&& apt-get install -y hip-base hipify-clang rocm-core hipcc hip-dev
# Install CMake
RUN wget https://github.com/Kitware/CMake/releases/download/v3.21.7/cmake-3.21.7-linux-x86_64.sh \
@@ -41,62 +39,64 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.21.7/cmake-3.21.7
ENV PATH="/cmake/bin:/opt/rocm/bin:${PATH}"
ENV HIP_COMPILER=nvcc HIP_PLATFORM=nvidia HIP_RUNTIME=cuda
RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \
&& ldconfig
# Install rocRAND
RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./rocRAND-rocm-5.4.0 -B ./rocRAND-rocm-5.4.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.7.0.tar.gz \
&& tar -xf ./rocm-5.7.0.tar.gz \
&& rm ./rocm-5.7.0.tar.gz \
&& cmake -S ./rocRAND-rocm-5.7.0 -B ./rocRAND-rocm-5.7.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D BUILD_HIPRAND=OFF \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
&& cmake --build ./rocRAND-rocm-5.4.0/build --target install \
&& rm -rf ./rocRAND-rocm-5.4.0
&& cmake --build ./rocRAND-rocm-5.7.0/build --target install \
&& rm -rf ./rocRAND-rocm-5.7.0
# Install hipCUB
RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipCUB-rocm-5.4.0 -B ./hipCUB-rocm-5.4.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.7.0.tar.gz \
&& tar -xf ./rocm-5.7.0.tar.gz \
&& rm ./rocm-5.7.0.tar.gz \
&& cmake -S ./hipCUB-rocm-5.7.0 -B ./hipCUB-rocm-5.7.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
&& cmake --build ./hipCUB-rocm-5.4.0/build --target install \
&& rm -rf ./hipCUB-rocm-5.4.0
&& cmake --build ./hipCUB-rocm-5.7.0/build --target install \
&& rm -rf ./hipCUB-rocm-5.7.0
# Install hipBLAS
RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipBLAS-rocm-5.4.0 -B ./hipBLAS-rocm-5.4.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.7.0.tar.gz \
&& tar -xf ./rocm-5.7.0.tar.gz \
&& rm ./rocm-5.7.0.tar.gz \
&& cmake -S ./hipBLAS-rocm-5.7.0 -B ./hipBLAS-rocm-5.7.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
-D USE_CUDA=ON \
&& cmake --build ./hipBLAS-rocm-5.4.0/build --target install \
&& rm -rf ./hipBLAS-rocm-5.4.0
&& cmake --build ./hipBLAS-rocm-5.7.0/build --target install \
&& rm -rf ./hipBLAS-rocm-5.7.0
# Install hipSOLVER
RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipSOLVER-rocm-5.4.0 -B ./hipSOLVER-rocm-5.4.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.7.0.tar.gz \
&& tar -xf ./rocm-5.7.0.tar.gz \
&& rm ./rocm-5.7.0.tar.gz \
&& cmake -S ./hipSOLVER-rocm-5.7.0 -B ./hipSOLVER-rocm-5.7.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
-D USE_CUDA=ON \
&& cmake --build ./hipSOLVER-rocm-5.4.0/build --target install \
&& rm -rf ./hipSOLVER-rocm-5.4.0
&& cmake --build ./hipSOLVER-rocm-5.7.0/build --target install \
&& rm -rf ./hipSOLVER-rocm-5.7.0
# Install hipRAND
RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipRAND-rocm-5.4.0 -B ./hipRAND-rocm-5.4.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.7.0.tar.gz \
&& tar -xf ./rocm-5.7.0.tar.gz \
&& rm ./rocm-5.7.0.tar.gz \
&& cmake -S ./hipRAND-rocm-5.7.0 -B ./hipRAND-rocm-5.7.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
-D BUILD_WITH_LIB=CUDA \
&& cmake --build ./hipRAND-rocm-5.4.0/build --target install \
&& rm -rf ./hipRAND-rocm-5.4.0
&& cmake --build ./hipRAND-rocm-5.7.0/build --target install \
&& rm -rf ./hipRAND-rocm-5.7.0
# Use render group as an argument from user
ARG GID=109

6
Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile

@@ -24,10 +24,10 @@ ENV LANG en_US.utf8
# Install ROCM HIP and libraries using the installer script
RUN export DEBIAN_FRONTEND=noninteractive; \
wget https://repo.radeon.com/amdgpu-install/5.7.1/ubuntu/focal/amdgpu-install_5.7.50701-1_all.deb \
wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb \
&& apt-get update -qq \
&& apt-get install -y ./amdgpu-install_5.7.50701-1_all.deb \
&& rm ./amdgpu-install_5.7.50701-1_all.deb \
&& apt-get install -y ./amdgpu-install_5.7.50700-1_all.deb \
&& rm ./amdgpu-install_5.7.50700-1_all.deb \
&& amdgpu-install -y --usecase=hiplibsdk --no-dkms \
&& apt-get install -y libnuma-dev \
&& rm -rf /var/lib/apt/lists/*

8
HIP-Basic/CMakeLists.txt

@@ -41,9 +41,8 @@ add_subdirectory(device_globals)
add_subdirectory(dynamic_shared)
add_subdirectory(events)
add_subdirectory(gpu_arch)
add_subdirectory(hello_world)
if(NOT WIN32)
add_subdirectory(hello_world)
find_package(Perl)
if(Perl_FOUND)
@@ -65,10 +64,7 @@ if(NOT WIN32 AND NOT "${GPU_RUNTIME}" STREQUAL "HIP")
add_subdirectory(static_host_library)
endif()
add_subdirectory(streams)
# temporarily exclude texture management on Windows
if(NOT WIN32)
add_subdirectory(texture_management)
endif()
add_subdirectory(texture_management)
add_subdirectory(warp_shuffle)
find_package(glfw3)

2
HIP-Basic/device_globals/main.hip

@@ -75,7 +75,7 @@ int main()
constexpr unsigned int block_size = size;
// Number of blocks per kernel grid. The expression below calculates ceil(size/block_size).
constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(size, block_size);
// Allocate host vectors for the input and output.
std::vector<float> h_in(size);

14
HIP-Basic/device_query/main.cpp

@@ -65,8 +65,9 @@ void print_device_properties(int device_id)
// Print a small set of all available properties. A full list can be found at:
// https://docs.amd.com/bundle/HIP_API_Guide/page/structhip_device_prop__t.html
std::cout << std::setw(col_w) << "Name: " << props.name << '\n';
std::cout << std::setw(col_w) << "totalGlobalMem: " << std::fixed << std::setprecision(2)
<< bytes_to_gib(props.totalGlobalMem) << " GiB\n";
std::cout << std::setw(col_w)
<< "totalGlobalMem: " << double_precision(bytes_to_gib(props.totalGlobalMem), 2, true)
<< " GiB\n";
std::cout << std::setw(col_w) << "sharedMemPerBlock: " << bytes_to_kib(props.sharedMemPerBlock)
<< " KiB\n";
std::cout << std::setw(col_w) << "regsPerBlock: " << props.regsPerBlock << '\n';
@@ -133,10 +134,11 @@ void print_device_properties(int device_id)
size_t free, total;
HIP_CHECK(hipMemGetInfo(&free, &total));
std::cout << std::fixed << std::setprecision(2);
std::cout << std::setw(col_w) << "memInfo.total: " << bytes_to_gib(total) << " GiB\n";
std::cout << std::setw(col_w) << "memInfo.free: " << bytes_to_gib(free) << " GiB ("
<< std::setprecision(0) << static_cast<double>(free) / total * 100.0 << "%)\n";
std::cout << std::setw(col_w)
<< "memInfo.total: " << double_precision(bytes_to_gib(total), 2, true) << " GiB\n";
std::cout << std::setw(col_w)
<< "memInfo.free: " << double_precision(bytes_to_gib(free), 2, true) << " GiB ("
<< double_precision(static_cast<double>(free) / total * 100.0, 0, true) << "%)\n";
}
} // namespace

4
HIP-Basic/hello_world/CMakeLists.txt

@@ -22,10 +22,6 @@
set(example_name hip_hello_world)
if(WIN32)
message(FATAL_ERROR "The hello world example currently does not support Windows.")
endif()
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(${example_name} LANGUAGES CXX)

4
HIP-Basic/module_api/main.hip

@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,7 @@ int main(int, char* argv[])
constexpr unsigned int block_size = size;
// Number of blocks per kernel grid. The expression below calculates ceil(size/block_size).
constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(size, block_size);
// Allocate host vectors for the input and output.
std::vector<float> h_in(size);

2
HIP-Basic/moving_average/main.hip

@@ -93,7 +93,7 @@ int main()
constexpr unsigned int block_size = 256;
// Number of blocks per kernel grid.
constexpr unsigned int grid_size = (output_size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(output_size, block_size);
// Allocate and initialize input data on the host.
std::vector<unsigned int> h_input(input_size);

1
HIP-Basic/multi_gpu_data_transfer/README.md

@@ -29,6 +29,7 @@ In this example, the result of a matrix transpose kernel execution on one device
- With `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`), from device to host (using `hipMemcpyDeviceToHost`) or from device to device (using `hipMemcpyDeviceToDevice`). The latter will only work if P2P communication has been enabled from the destination to the source device.
- `myKernelName<<<...>>>` queues the execution of a kernel on the current device and `hipDeviceSynchronize` makes the host wait on all active streams on the current device. In this example `hipDeviceSynchronize` is necessary because the second device needs the results obtained from the previous kernel execution on the first device.
- `hipDeviceReset` discards the state of the current device and resets it to a fresh one. It also frees all the resources (e.g. streams, events, ...) associated with the current device.
- It is a [known issue with multi-GPU environments](https://community.amd.com/t5/knowledge-base/iommu-advisory-for-multi-gpu-environments/ta-p/477468) that some setups fail due to limitations of IOMMU enablement, so it may be necessary to explicitly enable or disable the IOMMU using the kernel command-line parameter `iommu=pt` or `iommu=off`.
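For reference, a minimal sketch of the peer-to-peer checks described in the points above, using standard HIP runtime calls (`hipGetDeviceCount`, `hipDeviceCanAccessPeer`, `hipDeviceEnablePeerAccess`); device indices 0 and 1 are assumed for illustration, and error handling is reduced to bare checks:

```cpp
#include <hip/hip_runtime.h>

#include <iostream>

int main()
{
    int device_count = 0;
    if(hipGetDeviceCount(&device_count) != hipSuccess || device_count < 2)
    {
        std::cout << "Peer-to-peer requires at least 2 HIP devices." << std::endl;
        return 0;
    }

    // Ask whether device 0 can directly access memory allocated on device 1.
    int can_access = 0;
    if(hipDeviceCanAccessPeer(&can_access, 0, 1) == hipSuccess && can_access)
    {
        // Peer access is enabled from the currently selected device (device 0 here).
        if(hipSetDevice(0) == hipSuccess && hipDeviceEnablePeerAccess(1, 0) == hipSuccess)
        {
            // ... hipMemcpy with hipMemcpyDeviceToDevice now uses the direct peer path ...
            hipDeviceDisablePeerAccess(1);
        }
    }
    return 0;
}
```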
## Demonstrated API Calls

74
HIP-Basic/multi_gpu_data_transfer/main.hip

@@ -24,8 +24,10 @@
#include <hip/hip_runtime.h>
#include <cmath>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>
/// \brief Checks whether peer-to-peer is supported or not among the current available devices.
@@ -40,7 +42,7 @@ std::pair<int, int> check_peer_to_peer_support()
if(gpu_count < 2)
{
std::cout << "Peer-to-peer application requires at least 2 GPU devices." << std::endl;
exit(hipSuccess);
exit(0);
}
// Check accessibility for each device available.
@@ -59,9 +61,9 @@ std::pair<int, int> check_peer_to_peer_support()
}
}
// No pair of devices supporting peer-to-peer between them has been found.
std::cout << "Peer-to-peer application requires at least 2 GPU devices accesible between them."
std::cout << "Peer-to-peer application requires at least 2 GPU devices accessible between them."
<< std::endl;
exit(hipSuccess);
exit(0);
}
/// \brief Enables (if possible) direct memory access from <tt>current_gpu<\tt> to <tt>peer_gpu<\tt>.
@@ -99,12 +101,12 @@ void disable_peer_to_peer(const unsigned int current_gpu, const unsigned int pee
}
/// \brief Simple matrix transpose kernel using static shared memory.
template<const unsigned int Width = 32>
template<const unsigned int Width = 32, const unsigned int Height = 32>
__global__ void static_shared_matrix_transpose_kernel(float* out, const float* in)
{
// Allocate the necessary amount of shared memory to store the transpose of the matrix.
// Note that the amount of shared memory needed is known at compile time.
constexpr unsigned int size = Width * Width;
constexpr unsigned int size = Width * Height;
__shared__ float shared_matrix_memory[size];
// Compute the row and column indexes of the matrix element that each thread is going
@@ -112,27 +114,27 @@ __global__ void static_shared_matrix_transpose_kernel(float* out, const float* i
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
// If not out of bounds, transpose element (x,y).
if(x < Width && y < Width)
// If out of bounds, do nothing.
if(!(x < Width && y < Height))
{
// Store transposed element in shared memory.
shared_matrix_memory[y * Width + x] = in[x * Width + y];
return;
}
// Store transposed element (x,y) in shared memory.
shared_matrix_memory[y * Width + x] = in[x * Height + y];
// Synchronize threads so all writes are done before accessing shared memory again.
__syncthreads();
// If not out of bounds, transpose element (x,y).
if(x < Width && y < Width)
{
// Copy transposed element from shared memory to global memory.
out[y * Width + x] = shared_matrix_memory[y * Width + x];
}
// Copy transposed element from shared to global memory (output matrix).
out[y * Width + x] = shared_matrix_memory[y * Width + x];
}
/// \brief Simple matrix transpose kernel using dynamic shared memory.
__global__ void
dynamic_shared_matrix_transpose_kernel(float* out, const float* in, const unsigned int width)
__global__ void dynamic_shared_matrix_transpose_kernel(float* out,
const float* in,
const unsigned int width,
const unsigned int height)
{
// Declare that this kernel is using dynamic shared memory to store a number of floats.
// The unsized array type indicates that the total amount of memory that is going
@@ -145,22 +147,20 @@ __global__ void
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
// If not out of bounds, transpose element (x,y).
if(x < width && y < width)
// If out of bounds, do nothing.
if(!(x < width && y < height))
{
// Store transposed element in shared memory.
shared_matrix_memory[y * width + x] = in[x * width + y];
return;
}
// Store transposed element (x,y) in shared memory.
shared_matrix_memory[y * width + x] = in[x * height + y];
// Synchronize threads so all writes are done before accessing shared memory again.
__syncthreads();
// If not out of bounds, transpose element (x,y).
if(x < width && y < width)
{
// Copy transposed element from shared memory to global memory.
out[y * width + x] = shared_matrix_memory[y * width + x];
}
// Copy transposed element from shared to global memory (output matrix).
out[y * width + x] = shared_matrix_memory[y * height + x];
}
int main()
@@ -174,19 +174,22 @@ int main()
// Number of rows and columns, total number of elements and size in bytes of the matrix
// to be transposed.
constexpr unsigned int width = 32;
constexpr unsigned int size = width * width;
constexpr unsigned int width = 4;
constexpr unsigned int height = width;
constexpr unsigned int size = width * height;
constexpr size_t size_bytes = size * sizeof(float);
// Number of threads in each dimension of the kernel block.
constexpr unsigned int block_size = 4;
// Number of blocks in each dimension of the grid. Calculated as ceiling(width/block_size).
constexpr unsigned int grid_size = (width + block_size - 1) / block_size;
// Number of blocks in each dimension of the grid. Calculated as
// ceiling(matrix_dimension/block_size) with matrix_dimension being width or height.
constexpr unsigned int grid_size_x = ceiling_div(width, block_size);
constexpr unsigned int grid_size_y = ceiling_div(height, block_size);
// Block and grid sizes in 2D.
const dim3 block_dim(block_size, block_size);
const dim3 grid_dim(grid_size, grid_size);
const dim3 grid_dim(grid_size_x, grid_size_y);
// Allocate host input matrix and initialize with increasing sequence 1, 2, 3, ....
std::vector<float> matrix(size);
@@ -213,7 +216,7 @@ int main()
// Launch kernel in current device. Note that, as this kernel uses static shared memory, no
// bytes of shared memory need to be allocated when launching the kernel.
static_shared_matrix_transpose_kernel<width>
static_shared_matrix_transpose_kernel<width, height>
<<<grid_dim, block_dim, 0 /*shared_memory_bytes*/, hipStreamDefault>>>(
d_transposed_matrix[0],
d_matrix[0]);
@@ -248,7 +251,8 @@ int main()
size_bytes /*shared_memory_bytes*/,
hipStreamDefault>>>(d_transposed_matrix[1],
d_matrix[1],
width);
width,
height);
// Wait on all active streams on the current device.
HIP_CHECK(hipDeviceSynchronize());
@@ -281,7 +285,7 @@ int main()
if(errors)
{
std::cout << "Validation failed with " << errors << " errors." << std::endl;
return hipSuccess;
return error_exit_code;
}
else
{

15
HIP-Basic/occupancy/main.hip

@@ -24,10 +24,8 @@
#include <hip/hip_runtime.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
/// \brief Performs pair-wise multiplication of two vectors and stores the result into
@@ -58,9 +56,10 @@ void print_occupancy(const int block_size)
pairwise_product_kernel,
block_size,
0));
std::cout << std::setprecision(3) << "Theoretical Occupancy is "
<< static_cast<double>(num_blocks) * block_size
/ dev_prop.maxThreadsPerMultiProcessor * 100
std::cout << "Theoretical Occupancy is "
<< double_precision(static_cast<double>(num_blocks) * block_size
/ dev_prop.maxThreadsPerMultiProcessor * 100,
3)
<< "%" << std::endl;
}
}
@@ -97,8 +96,7 @@ void deploy_kernel_manual_parameters(float* d_C,
float event_ms;
HIP_CHECK(hipEventElapsedTime(&event_ms, start, stop));
std::cout << std::setprecision(2) << "Kernel Execution Time: " << event_ms << " ms"
<< std::endl;
std::cout << "Kernel Execution Time: " << double_precision(event_ms, 2) << " ms" << std::endl;
// Destroy the events
HIP_CHECK(hipEventDestroy(start));
@@ -145,8 +143,7 @@ void deploy_kernel_automatic_parameters(float* d_C,
float event_ms;
HIP_CHECK(hipEventElapsedTime(&event_ms, start, stop));
std::cout << std::setprecision(2) << "Kernel Execution Time: " << event_ms << " ms"
<< std::endl;
std::cout << "Kernel Execution Time: " << double_precision(event_ms, 2) << " ms" << std::endl;
// Destroy the events
HIP_CHECK(hipEventDestroy(start));

36
HIP-Basic/opengl_interop/README.md

@@ -20,9 +20,39 @@ External device resources and other handles can be shared with HIP in order to p
## Dependencies
This example has additional library dependencies besides HIP:
- [GLFW3](https://glfw.org). GLFW can be installed either through the package manager, or can be obtained from its home page. If using CMake, the `glfw3Config.cmake` file must be in a path that CMake searches by default or must be passed using `-DCMAKE_MODULE_PATH`.
The official GLFW3 binaries do not ship this file on Windows, and so GLFW3 must either be compiled manually. CMake will be able to find GLFW on Windows if it is installed in `C:\Program Files(x86)\glfw\`. Alternatively, GLFW can be obtained from [vcpkg](https://vcpkg.io/), which does ship the required cmake files. In this case, the vcpkg toolchain path should be passed to CMake using `-DCMAKE_TOOLCHAIN_FILE="/path/to/vcpkg/scripts/buildsystems/vcpkg.cmake"`.
If using Visual Studio, the easiest way to obtain GLFW is by installing glfw3 from vcpkg. Alternatively, the appropriate path to the GLFW3 library and header directories can be set in Properties->Linker->General->Additional Library Directories and Properties->C/C++->General->Additional Include Directories. When using this method, the appropriate name for the glfw library should also be updated under Properties->C/C++->Linker->Input->Additional Dependencies.
- [GLFW](https://glfw.org). There are three options for satisfying this dependency:
1. Install it through a package manager. This is available on Linux, where GLFW can be installed from most of the usual package managers:
- APT: `apt-get install libglfw3-dev`
- Pacman: `pacman -S glfw-x11` or `pacman -S glfw-wayland`
- DNF: `dnf install glfw-devel`
It could also happen that the `Xxf86vm` and `Xi` libraries required when linking against Vulkan are not installed. They are also available from the same package managers:
- APT: `apt-get install libxxf86vm-dev libxi-dev`
- Pacman: `pacman -S libxi libxxf86vm`
- DNF: `dnf install libXi-devel libXxf86vm-devel`
2. Build from source. GLFW supports compilation on Windows with Visual C++ (2010 and later), MinGW and MinGW-w64, and on Linux and other Unix-like systems with GCC and Clang. Please refer to the [compile guide](https://www.glfw.org/docs/latest/compile.html) for a complete guide on how to do this. Note: it should not only be built as explained in the guide, but also installed with the install target (`cmake --build <build-folder> --target install`).
3. Get the pre-compiled binaries from its [download page](https://www.glfw.org/download). Available for Windows.
Depending on the build tool used, some extra steps may be needed:
- If using CMake, the `glfw3Config.cmake` and `glfw3Targets.cmake` files must be in a path that CMake searches by default or must be passed using `-DCMAKE_MODULE_PATH`. The official GLFW3 binaries do not ship these files on Windows, and so GLFW must either be compiled manually or obtained from [vcpkg](https://vcpkg.io/), which does ship the required cmake files.
- If the former approach is selected, CMake will be able to find GLFW on Windows if the environment variable `GLFW3_DIR` (or the CMake option `-DCMAKE_PREFIX_PATH`) is set to (or contains) the folder containing `glfw3Config.cmake` and `glfw3Targets.cmake`. For instance, if GLFW was installed in `C:\Program Files (x86)\GLFW\`, this will most likely be something like `C:\Program Files (x86)\GLFW\lib\cmake\glfw3\`.
- If the latter, the vcpkg toolchain path should be passed to CMake using `-DCMAKE_TOOLCHAIN_FILE="/path/to/vcpkg/scripts/buildsystems/vcpkg.cmake"`.
- If using Visual Studio, the easiest way to obtain GLFW is by installing `glfw3` from vcpkg. Alternatively, the appropriate path to the GLFW3 library and header directories can be set in `Properties->Linker->General->Additional Library Directories` and `Properties->C/C++->General->Additional Include Directories`. When using this method, the appropriate name for the GLFW library should also be updated under `Properties->C/C++->Linker->Input->Additional Dependencies`. For instance, if the path to the root folder of the Windows binaries installation was `C:\glfw-3.3.8.bin.WIN64\` and we set `GLFW_DIR` with this path, the project configuration file (`.vcxproj`) should end up containing something similar to the following:
```
<ItemDefinitionGroup>
<ClCompile>
...
<AdditionalIncludeDirectories>$(GLFW_DIR)\include\;<other_include_directories>;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
...
</ClCompile>
<Link>
...
<AdditionalDependencies>glfw3dll.lib;<other_dependencies>;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(GLFW_DIR)\lib;<other_library_directories>;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
...
</Link>
</ItemDefinitionGroup>
```
## Key APIs and Concepts
- `hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount, hipGLDeviceList deviceList)` can be used to query which HIP devices can be used to share resources with the current OpenGL context. A device returned by this function must be selected using `hipSetDevice` or a stream must be created from such a device before OpenGL interop is possible.
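A hedged sketch of that selection step, reusing `HIP_CHECK` and `error_exit_code` from `Common/example_utils.hpp` and assuming the example's existing interop includes, an already-current OpenGL context, and `hipGLDeviceListAll` as the device-list selector:

```cpp
// Hypothetical helper: pick a HIP device that can share resources with the
// current OpenGL context and make it the active device.
int pick_gl_interop_device()
{
    unsigned int gl_device_count = 0;
    int          gl_device       = -1;
    // Query at most one HIP device that is compatible with the current OpenGL context.
    HIP_CHECK(hipGLGetDevices(&gl_device_count, &gl_device, 1, hipGLDeviceListAll));
    if(gl_device_count == 0)
    {
        std::cerr << "No HIP device can interoperate with the current OpenGL context\n";
        std::exit(error_exit_code);
    }
    // The device must be selected before any interop resources are created.
    HIP_CHECK(hipSetDevice(gl_device));
    return gl_device;
}
```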

11
HIP-Basic/opengl_interop/main.hip

@@ -367,10 +367,9 @@ struct renderer
{
const auto time_diff_sec
= std::chrono::duration_cast<std::chrono::duration<float>>(time_diff).count();
std::cout << "Average FPS (over " << std::fixed << std::setprecision(2) << time_diff_sec
<< " seconds): " << std::fixed << std::setprecision(2)
<< this->fps_frame / time_diff_sec << " (" << std::fixed
<< std::setprecision(2) << (time_diff_sec * 1000) / this->fps_frame
std::cout << "Average FPS (over " << double_precision(time_diff_sec, 2, true)
<< " seconds): " << double_precision(this->fps_frame / time_diff_sec, 2, true)
<< " (" << double_precision((time_diff_sec * 1000) / this->fps_frame, 2, true)
<< " ms per frame, " << this->fps_frame << " frames)" << std::endl;
this->fps_frame = 0;
this->fps_start_time = frame_time;
@@ -478,8 +477,8 @@ struct simulator
constexpr size_t tile_size = 8;
// Launch the HIP kernel to advance the simulation.
sinewave_kernel<<<dim3((grid_width + tile_size - 1) / tile_size,
(grid_height + tile_size - 1) / tile_size),
sinewave_kernel<<<dim3(ceiling_div(grid_width, tile_size),
ceiling_div(grid_height, tile_size)),
dim3(tile_size, tile_size),
0,
this->hip_stream>>>(this->hip_height_ptr, time);

26
HIP-Basic/opengl_interop/nvidia_hip_fix.hpp

@@ -32,32 +32,6 @@ hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource_t* const resource,
return hipCUDAErrorTohipError(cudaGraphicsGLRegisterBuffer(resource, buffer, flags));
}
hipError_t hipGraphicsMapResources(const int count,
hipGraphicsResource_t* const resources,
const hipStream_t stream = 0)
{
return hipCUDAErrorTohipError(cudaGraphicsMapResources(count, resources, stream));
}
hipError_t hipGraphicsResourceGetMappedPointer(void** const dev_ptr,
size_t* const size,
const cudaGraphicsResource_t resource)
{
return hipCUDAErrorTohipError(cudaGraphicsResourceGetMappedPointer(dev_ptr, size, resource));
}
hipError_t hipGraphicsUnmapResources(const int count,
hipGraphicsResource_t* const resources,
const hipStream_t stream = 0)
{
return hipCUDAErrorTohipError(cudaGraphicsUnmapResources(count, resources, stream));
}
hipError_t hipGraphicsUnregisterResource(const hipGraphicsResource_t resource)
{
return hipCUDAErrorTohipError(cudaGraphicsUnregisterResource(resource));
}
#endif
#endif

9
HIP-Basic/runtime_compilation/main.hip

@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@@ -36,9 +36,10 @@ static constexpr auto saxpy_kernel{
// if the /E flag is passed (as NVCC does).
"#include \"test_header.h\"\n"
"#include \"test_header1.h\"\n"
// MSVC 19.16 does not properly preprocess extern "C" when it's not on the same line as the
// function declaration if the /E flag is passed (as NVCC does).
R"(
extern "C"
__global__ void saxpy_kernel(const real a, const realptr d_x, realptr d_y, const unsigned int size)
extern "C" __global__ void saxpy_kernel(const real a, const realptr d_x, realptr d_y, const unsigned int size)
{
const unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
if(global_idx < size)
@@ -136,7 +137,7 @@ int main()
constexpr unsigned int block_size = 128;
// Number of blocks per kernel grid, calculated as ceil(size/block_size).
constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(size, block_size);
// Constant value 'a' to be used in the expression 'a*x+y'.
constexpr float a = 5.1f;

2
HIP-Basic/saxpy/main.hip

@@ -56,7 +56,7 @@ int main()
constexpr unsigned int block_size = 256;
// Number of blocks per kernel grid. The expression below calculates ceil(size/block_size).
constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(size, block_size);
// The constant value to use in the a*x+y formula.
constexpr float a = 2.f;

2
HIP-Basic/shared_memory/main.hip

@@ -87,7 +87,7 @@ int main()
constexpr unsigned int block_size = 4;
// Number of blocks in each dimension of the grid. Calculated as ceil(width/block_size).
constexpr unsigned int grid_size = (width + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(width, block_size);
// Block and grid sizes in 2D.
const dim3 block_dim(block_size, block_size);

2
HIP-Basic/static_device_library/main.hip

@@ -56,7 +56,7 @@ int main()
constexpr unsigned int block_size = 256;
// The number of blocks per kernel grid. The expression below calculates `ceil(size / block_size)`.
constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(size, block_size);
// Allocate host input vector and fill it with an increasing sequence (i.e. 0, 1, 2, ...).
std::vector<int> in(size);

2
HIP-Basic/static_host_library/library/library.hip

@@ -57,7 +57,7 @@ int run_test()
constexpr unsigned int block_size = 256;
// The number of blocks per kernel grid. The expression below calculates `ceil(size / block_size)`.
constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
constexpr unsigned int grid_size = ceiling_div(size, block_size);
// Allocate host input vector and fill it with an increasing sequence (i.e. 0, 1, 2, ...).
std::vector<uint32_t> in(size);

58
HIP-Basic/vulkan_interop/README.md

@@ -54,10 +54,60 @@ To signal a shared semaphore in HIP, the `hipSignalExternalSemaphoresAsync` func
## Dependencies
This example has additional library dependencies besides HIP:
- [GLFW3](https://glfw.org). GLFW can be installed either through the package manager, or can be obtained from its home page. If using CMake, the `glfw3Config.cmake` file must be in a path that CMake searches by default or must be passed using `-DCMAKE_MODULE_PATH`.
The official GLFW3 binaries do not ship this file on Windows, and so GLFW3 must either be compiled manually. CMake will be able to find GLFW on Windows if it is installed in `C:\Program Files(x86)\glfw\`. Alternatively, GLFW can be obtained from [vcpkg](https://vcpkg.io/), which does ship the required cmake files. In this case, the vcpkg toolchain path should be passed to CMake using `-DCMAKE_TOOLCHAIN_FILE="/path/to/vcpkg/scripts/buildsystems/vcpkg.cmake"`.
If using Visual Studio, the easiest way to obtain GLFW is by installing glfw3 from vcpkg. Alternatively, the appropriate path to the GLFW3 library and header directories can be set in Properties->Linker->General->Additional Library Directories and Properties->C/C++->General->Additional Include Directories. When using this method, the appropriate name for the glfw library should also be updated under Properties->C/C++->Linker->Input->Additional Dependencies.
- Vulkan headers, validation layers, and `glslangValidator` are required. The easiest way to obtain this is by installing the [LunarG Vulkan SDK](https://vulkan.lunarg.com/). CMake will be able to find the SDK using the `VULKAN_SDK` environment variable, which is set by default using the SDK activation script on Linux. On Windows, this environment variable is not automatically provided, and so should be set to the appropriate path before invoking CMake. The Visual Studio projects will automatically pick up `VULKAN_SDK`. Alternatively, the required Vulkan components can be installed through the system package manager. Note that libvulkan is _not_ required, the example loads function pointers dynamically.
- [GLFW](https://glfw.org). There are three options for satisfying this dependency:
1. Install it through a package manager. This is available on Linux, where GLFW can be installed from most of the usual package managers:
- APT: `apt-get install libglfw3-dev`
- Pacman: `pacman -S glfw-x11` or `pacman -S glfw-wayland`
- DNF: `dnf install glfw-devel`
It could also happen that the `Xxf86vm` and `Xi` libraries required when linking against Vulkan are not installed. They are also available from the same package managers:
- APT: `apt-get install libxxf86vm-dev libxi-dev`
- Pacman: `pacman -S libxi libxxf86vm`
- DNF: `dnf install libXi-devel libXxf86vm-devel`
2. Build from source. GLFW supports compilation on Windows with Visual C++ (2010 and later), MinGW and MinGW-w64, and on Linux and other Unix-like systems with GCC and Clang. Please refer to the [compile guide](https://www.glfw.org/docs/latest/compile.html) for a complete guide on how to do this. Note: it should not only be built as explained in the guide, but also installed with the install target (`cmake --build <build-folder> --target install`).
3. Get the pre-compiled binaries from its [download page](https://www.glfw.org/download). Available for Windows.
Depending on the build tool used, some extra steps may be needed:
- If using CMake, the `glfw3Config.cmake` and `glfw3Targets.cmake` files must be in a path that CMake searches by default or must be passed using `-DCMAKE_MODULE_PATH`. The official GLFW3 binaries do not ship these files on Windows, and so GLFW must either be compiled manually or obtained from [vcpkg](https://vcpkg.io/), which does ship the required cmake files.
- If the former approach is selected, CMake will be able to find GLFW on Windows if the environment variable `GLFW3_DIR` (or the CMake option `-DCMAKE_PREFIX_PATH`) is set to (or contains) the folder containing `glfw3Config.cmake` and `glfw3Targets.cmake`. For instance, if GLFW was installed in `C:\Program Files (x86)\GLFW\`, this will most likely be something like `C:\Program Files (x86)\GLFW\lib\cmake\glfw3\`.
- If the latter, the vcpkg toolchain path should be passed to CMake using `-DCMAKE_TOOLCHAIN_FILE="/path/to/vcpkg/scripts/buildsystems/vcpkg.cmake"`.
- If using Visual Studio, the easiest way to obtain GLFW is by installing `glfw3` from vcpkg. Alternatively, the appropriate path to the GLFW3 library and header directories can be set in `Properties->Linker->General->Additional Library Directories` and `Properties->C/C++->General->Additional Include Directories`. When using this method, the appropriate name for the GLFW library should also be updated under `Properties->C/C++->Linker->Input->Additional Dependencies`. For instance, if the path to the root folder of the Windows binaries installation was `C:\glfw-3.3.8.bin.WIN64\` and we set `GLFW_DIR` with this path, the project configuration file (`.vcxproj`) should end up containing something similar to the following:
```
<ItemDefinitionGroup>
<ClCompile>
...
<AdditionalIncludeDirectories>$(GLFW_DIR)\include\;<other_include_directories>;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
...
</ClCompile>
<Link>
...
<AdditionalDependencies>glfw3dll.lib;<other_dependencies>;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(GLFW_DIR)\lib;<other_library_directories>;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
...
</Link>
</ItemDefinitionGroup>
```
- Vulkan headers. On Linux, the Vulkan headers can be obtained directly from some package managers:
- APT: `apt-get install -y libvulkan-dev`
- Pacman: `pacman -S vulkan-headers vulkan-icd-loader`
- DNF: `dnf install vulkan-headers vulkan-icd-loader`
They may also be obtained by installing the [LunarG Vulkan SDK](https://vulkan.lunarg.com/). CMake will be able to find the SDK using the `VULKAN_SDK` environment variable, which is set by default by the SDK activation script.
On Windows, on the other hand, the headers can only be obtained from the [LunarG Vulkan SDK](https://vulkan.lunarg.com/). Unlike on Unix-based OSs, the `VULKAN_SDK` environment variable is not automatically provided on Windows, so it should be set to the appropriate path before invoking CMake.
Note that `libvulkan` is _not_ required, as the example loads function pointers dynamically.
- Validation layers. The `VK_LAYER_KHRONOS_validation` layer is active by default to perform general checks on Vulkan, so the [Khronos Vulkan Validation Layers (VVL)](https://github.com/KhronosGroup/Vulkan-ValidationLayers/tree/main#vulkan-validation-layers-vvl) need to be installed on the system if such checks are desired. They can be installed from a package manager (on Linux), built and configured from source, or installed as part of the [LunarG Vulkan SDK](https://vulkan.lunarg.com/).
Package managers offering the validation layers package include:
- APT: `apt install vulkan-validationlayers-dev`
- Pacman: `pacman -S vulkan-validation-layers`. Note that with pacman both the validation layers and headers (among others) can also be installed with `pacman -S vulkan-devel`.
- DNF: `dnf install vulkan-validation-layers`
For the second approach, build instructions are provided in the [Khronos Vulkan-ValidationLayers repository](https://github.com/KhronosGroup/Vulkan-ValidationLayers/blob/main/BUILD.md), and Vulkan's [Layers Overview and Configuration](https://vulkan.lunarg.com/doc/view/latest/windows/layer_configuration.html) document offers several approaches for their configuration.
- `glslangValidator`. It is used in the example as a shader validation tool. It may be installed via a package manager (`sudo apt install glslang-tools`), by [building manually from source](https://github.com/KhronosGroup/glslang#building-cmake), by downloading the binaries for the corresponding platform directly from the [main-tot](https://github.com/KhronosGroup/glslang/releases/tag/main-tot) release on GitHub, or as part of the [LunarG Vulkan SDK](https://vulkan.lunarg.com/).
## Demonstrated API Calls
### HIP runtime

24
HIP-Basic/vulkan_interop/main.hip

@@ -455,9 +455,9 @@ hipExternalMemory_t
desc.type = hipExternalMemoryHandleTypeOpaqueFd;
VkMemoryGetFdInfoKHR get_fd_info = {};
get_fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
get_fd_info.memory = memory;
get_fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
get_fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
get_fd_info.memory = memory;
get_fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
VK_CHECK(ctx.vkd->get_memory_fd(ctx.dev, &get_fd_info, &desc.handle.fd));
#endif
@@ -533,9 +533,9 @@ hipExternalSemaphore_t semaphore_to_hip(const graphics_context& ctx, const VkSem
desc.type = hipExternalSemaphoreHandleTypeOpaqueFd;
VkSemaphoreGetFdInfoKHR get_fd_info = {};
get_fd_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
get_fd_info.semaphore = sema;
get_fd_info.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
get_fd_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
get_fd_info.semaphore = sema;
get_fd_info.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
VK_CHECK(ctx.vkd->get_semaphore_fd(ctx.dev, &get_fd_info, &desc.handle.fd));
#endif
@@ -1163,8 +1163,8 @@ struct renderer
constexpr size_t tile_size = 8;
// Launch the HIP kernel to advance the simulation.
sinewave_kernel<<<dim3((grid_width + tile_size - 1) / tile_size,
(grid_height + tile_size - 1) / tile_size),
sinewave_kernel<<<dim3(ceiling_div(grid_width, tile_size),
ceiling_div(grid_height, tile_size)),
dim3(tile_size, tile_size),
0,
this->hip_stream>>>(this->hip_height_buffer, time);
@@ -1249,10 +1249,10 @@ struct renderer
const auto time_diff_sec
= std::chrono::duration_cast<std::chrono::duration<float>>(time_diff).count();
const uint32_t frames = this->frame_index - this->fps_start_frame;
std::cout << "Average FPS (over " << std::fixed << std::setprecision(2) << time_diff_sec
<< " seconds): " << std::fixed << std::setprecision(2)
<< frames / time_diff_sec << " (" << std::fixed << std::setprecision(2)
<< (time_diff_sec * 1000) / frames << " ms per frame)" << std::endl;
std::cout << "Average FPS (over " << double_precision(time_diff_sec, 2, true)
<< " seconds): " << double_precision(frames / time_diff_sec, 2, true) << " ("
<< double_precision((time_diff_sec * 1000) / frames, 2, true)
<< " ms per frame)" << std::endl;
this->fps_start_frame = this->frame_index;
this->fps_start_time = frame_time;
}

2
Libraries/rocPRIM/block_sum/main.hip

@@ -98,7 +98,7 @@ int main()
// Input problem size
constexpr unsigned int size = 10000;
const unsigned int grid_size = (size + items_per_block - 1) / items_per_block;
const unsigned int grid_size = ceiling_div(size, items_per_block);
// Allocate input vector on the host
std::vector<int> h_in(size);

2
Scripts/WindowsRunner.ps1

@@ -2,7 +2,7 @@ param(
[Parameter(Mandatory)]
[string]$Path = "Debug",
[string]$Filter = "*.exe",
[int]$Timeout = 10,
[int]$Timeout = 60,
[string[]]$Skip = @()
)
$Skip = $Skip | ForEach-Object { $_.Trim() }
