Develop Stream: update to ROCm 6.2 (#165)

* update docker images and CI to ROCm 6.2.0
* fix rocfft brick bounds parameter order
* update formatting for rocm 6.2

---------

Co-authored-by: Robin Voetter <robin@streamhpc.com>
10 months ago · 0fdfd7f6e7
7 changed files with 23 additions and 23 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -39,7 +39,7 @@ variables:
  HIP_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror"
  # Keep in sync with ROCM_VERSION in Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
  # and Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile
-  DOCKER_ROCM_VERSION: 6.1.0
+  DOCKER_ROCM_VERSION: 6.2.0
  DOCKER_HIP_LIBRARIES_ROCM_TAG: rocm-ubuntu-${DOCKER_ROCM_VERSION}
  DOCKER_HIP_LIBRARIES_CUDA_TAG: cuda-ubuntu-${DOCKER_ROCM_VERSION}
  DOCKER_HIP_LIBRARIES_ROCM: $DOCKER_TAG_PREFIX:$DOCKER_HIP_LIBRARIES_ROCM_TAG
--- a/Common/example_utils.hpp
+++ b/Common/example_utils.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
--- a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
+++ b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
@@ -2,11 +2,11 @@
 # Above is required for substitutions in environment variables

 # CUDA based docker image
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+FROM nvidia/cuda:12.6.0-devel-ubuntu22.04

 # The ROCm versions that this image is based of.
 # Always write this down as major.minor.patch
-ENV ROCM_VERSION=6.1.0
+ENV ROCM_VERSION=6.2.0
 ENV ROCM_VERSION_APT=${ROCM_VERSION%.0}

 # Base packages that are required for the installation
@@ -53,10 +53,14 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \
 ENV HIP_COMPILER=nvcc HIP_PLATFORM=nvidia HIP_RUNTIME=cuda

 # Install rocRAND
+# We need to apply this patch to make it work on Nvidia for ROCm 6.2: https://github.com/ROCm/rocRAND/commit/7ec5fda5243e599d83af841b5c38198a2f7f05fa
 RUN wget https://github.com/ROCm/rocRAND/archive/refs/tags/rocm-${ROCM_VERSION}.tar.gz -O rocrand.tar.gz \
    && mkdir rocrand \
    && tar -xf ./rocrand.tar.gz --strip-components 1 -C rocrand \
    && rm ./rocrand.tar.gz \
+    && wget https://github.com/ROCm/rocRAND/commit/7ec5fda5243e599d83af841b5c38198a2f7f05fa.patch -O rocrand.patch \
+    && patch -p1 -d rocrand < ./rocrand.patch \
+    && rm rocrand.patch \
    && cmake -S ./rocrand -B ./rocrand/build \
        -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \
        -D BUILD_HIPRAND=OFF \
@@ -89,12 +93,11 @@ RUN wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-${ROCM_VERSION}.
    && rm -rf ./hipblas

 # Install hipSOLVER
-# hipSOLVER cmake for rocm-6.1.0 is broken added CXXFLAGS=-D__HIP_PLATFORM_NVIDIA__ as fix
 RUN wget https://github.com/ROCm/hipSOLVER/archive/refs/tags/rocm-${ROCM_VERSION}.tar.gz -O hipsolver.tar.gz \
    && mkdir hipsolver \
    && tar -xf ./hipsolver.tar.gz --strip-components 1 -C hipsolver \
    && rm ./hipsolver.tar.gz \
-    && CXXFLAGS=-D__HIP_PLATFORM_NVIDIA__ cmake -S ./hipsolver -B ./hipsolver/build \
+    && cmake -S ./hipsolver -B ./hipsolver/build \
        -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \
        -D CMAKE_INSTALL_PREFIX=/opt/rocm \
        -D USE_CUDA=ON \
@@ -102,13 +105,10 @@ RUN wget https://github.com/ROCm/hipSOLVER/archive/refs/tags/rocm-${ROCM_VERSION
    && rm -rf ./hipsolver

 # Install hipRAND
-# Manually replace usage of __HIP_PLATFORM_NVCC__ with __HIP_PLATFORM_NVIDIA__. See
-# https://github.com/ROCm/hipRAND/commit/4925f0da96fad5b9f532ddc79f1f52fc279d329f
 RUN wget https://github.com/ROCm/hipRAND/archive/refs/tags/rocm-${ROCM_VERSION}.tar.gz -O hiprand.tar.gz \
    && mkdir hiprand \
    && tar -xf ./hiprand.tar.gz --strip-components 1 -C hiprand \
    && rm ./hiprand.tar.gz \
-    && sed -i s/__HIP_PLATFORM_NVCC__/__HIP_PLATFORM_NVIDIA__/ ./hiprand/library/include/hiprand/hiprand.h \
    && cmake -S ./hiprand -B ./hiprand/build \
        -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \
        -D CMAKE_INSTALL_PREFIX=/opt/rocm \
--- a/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile
+++ b/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile
@@ -6,7 +6,7 @@ FROM ubuntu:22.04

 # The ROCm versions that this image is based of.
 # Always write this down as major.minor.patch
-ENV ROCM_VERSION=6.1.0
+ENV ROCM_VERSION=6.2.0
 ENV ROCM_VERSION_APT=${ROCM_VERSION%.0}

 # Base packages that are required for the installation
--- a/Libraries/hipFFT/plan_z2z/main.cpp
+++ b/Libraries/hipFFT/plan_z2z/main.cpp
@@ -56,9 +56,7 @@ void fft_example(const int dimension, const int size = 4, const int direction =
    std::uniform_real_distribution<double> distribution{};
    std::generate(input.begin(),
                  input.end(),
-                  [&]() {
-                      return input_t{distribution(generator), distribution(generator)};
-                  });
+                  [&]() { return input_t{distribution(generator), distribution(generator)}; });

    std::cout << "Input:\n" << std::setprecision(3);
    print_nd_data(input, n, 16);
--- a/Libraries/rocFFT/multi_gpu/main.cpp
+++ b/Libraries/rocFFT/multi_gpu/main.cpp
@@ -91,13 +91,16 @@ int main(int argc, char* argv[])

    // Define infield geometry
    // First entry of upper dimension is the batch size
+    const size_t              batch_size     = 1;
    const std::vector<size_t> inbrick0_lower = {0, 0, 0, 0};
-    const std::vector<size_t> inbrick0_upper = {1, length[0] / deviceCount, length[1], length[2]};
-    const std::vector<size_t> inbrick1_lower = {0, length[0] / deviceCount, 0, 0};
-    const std::vector<size_t> inbrick1_upper = {1, length[0], length[1], length[2]};
+    const std::vector<size_t> inbrick0_upper
+        = {length[0] / deviceCount, length[1], length[2], batch_size};
+    const std::vector<size_t> inbrick1_lower = {length[0] / deviceCount, 0, 0, 0};
+    const std::vector<size_t> inbrick1_upper = {length[0], length[1], length[2], batch_size};

    // Row-major stride for brick data layout in memory
-    std::vector<size_t> brick_stride = {fftSize, length[0] * length[1], length[0], 1};
+    const size_t        idist        = fftSize; // distance between batches
+    std::vector<size_t> brick_stride = {1, length[0] * length[1], length[0], idist};

    rocfft_field infield = nullptr;
    ROCFFT_CHECK(rocfft_field_create(&infield));
@@ -145,9 +148,9 @@ int main(int argc, char* argv[])

    std::vector<void*>        gpu_out(2);
    const std::vector<size_t> outbrick0_lower = {0, 0, 0, 0};
-    const std::vector<size_t> outbrick0_upper = {1, length[0] / deviceCount, length[1], length[2]};
-    const std::vector<size_t> outbrick1_lower = {0, length[0] / deviceCount, 0, 0};
-    const std::vector<size_t> outbrick1_upper = {1, length[0], length[1], length[2]};
+    const std::vector<size_t> outbrick0_upper = {length[0] / deviceCount, length[1], length[2], 1};
+    const std::vector<size_t> outbrick1_lower = {length[0] / deviceCount, 0, 0, 0};
+    const std::vector<size_t> outbrick1_upper = {length[0], length[1], length[2], 1};

    rocfft_brick outbrick0 = nullptr;
    ROCFFT_CHECK(rocfft_brick_create(&outbrick0,
--- a/Libraries/rocRAND/simple_distributions_cpp/main.cpp
+++ b/Libraries/rocRAND/simple_distributions_cpp/main.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -253,8 +253,7 @@ int main(const int argc, const char** argv)

    if(number_of_devies <= 0)
    {
-        std::cerr << "HIP supported devices not found!"
-                  << "\n";
+        std::cerr << "HIP supported devices not found!\n";
        exit(error_exit_code);
    }