Browse Source

Develop Stream 2024-03-21 general fixes (part I) (#97)

* bump the required cmake version to 3.21.3

* Fix device_globals example name

* Fix hip_streams timeout on AMD windows debug build type

* Update templates

* Update cuda container to ROCm 5.4

* Change std::bind into lambda

* HIP 5.5 fixes

* fix tests not being executed

* Make the reference to the identity and transpose op uniform

* Fix NVCC CI

* Resolve "Increase timeout for CI"

* Update fixed size arrays to C++ standards

* Add missing include in hip_texture_management

* Remove void** cast from hipMalloc

* Fix hip-libraries-cuda-ubuntu Dockerfile

* Make the windows builds less verbose

* Rework Windows CI

* Skip failing rocsparse tests

* Fix cooperative groups example

* ci: Make skipped examples more prominent in windows VS test runner

* Enable rocsparse examples in CI

* Update .gitlab/issue_templates/example.md

Fix small typo

---------

Co-authored-by: Balint Soproni <balint@streamhpc.com>
Co-authored-by: Robin Voetter <robin@streamhpc.com>
Co-authored-by: Nara Prasetya <nara@streamhpc.com>
Co-authored-by: Nol Moonen <nol@streamhpc.com>
Co-authored-by: Mátyás Aradi <matyas@streamhpc.com>
Co-authored-by: Gergely Mészáros <gergely@streamhpc.com>
Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>
pull/113/head
Beatriz Navidad Vilches 1 year ago committed by GitHub
parent
commit
95687ef285
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 343
      .gitlab-ci.yml
  2. 20
      .gitlab/issue_templates/example.md
  3. 16
      .gitlab/merge_request_templates/example.md
  4. 4
      Applications/floyd_warshall/main.hip
  5. 4
      CMakeLists.txt
  6. 14
      Common/example_utils.hpp
  7. 75
      Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
  8. 5
      HIP-Basic/cooperative_groups/CMakeLists.txt
  9. 2
      HIP-Basic/device_globals/CMakeLists.txt
  10. 6
      HIP-Basic/device_query/main.cpp
  11. 4
      HIP-Basic/occupancy/main.hip
  12. 5
      HIP-Basic/texture_management/main.hip
  13. 35
      Libraries/hipBLAS/gemm_strided_batched/README.md
  14. 10
      Libraries/hipBLAS/gemm_strided_batched/main.hip
  15. 8
      Libraries/hipCUB/device_radix_sort/main.hip
  16. 6
      Libraries/hipCUB/device_sum/main.hip
  17. 2
      Libraries/hipSOLVER/syevj/main.cpp
  18. 28
      Libraries/rocBLAS/level_3/gemm/README.md
  19. 10
      Libraries/rocBLAS/level_3/gemm/main.cpp
  20. 27
      Libraries/rocBLAS/level_3/gemm_strided_batched/README.md
  21. 10
      Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp
  22. 4
      Libraries/rocPRIM/block_sum/main.hip
  23. 6
      Libraries/rocPRIM/device_sum/main.hip
  24. 6
      Libraries/rocThrust/device_ptr/main.hip
  25. 6
      Libraries/rocThrust/norm/main.hip
  26. 6
      Libraries/rocThrust/reduce_sum/main.hip
  27. 6
      Libraries/rocThrust/remove_points/main.hip
  28. 6
      Libraries/rocThrust/saxpy/main.hip
  29. 6
      Libraries/rocThrust/vectors/main.hip
  30. 108
      Scripts/WindowsRunner.ps1

343
.gitlab-ci.yml

@ -31,8 +31,10 @@ include: @@ -31,8 +31,10 @@ include:
variables:
CUDA_FLAGS: "-Xcompiler -Wall,-Wextra,-Werror --Werror all-warnings"
CXX_FLAGS: "-Wall -Wextra -Werror"
HIP_FLAGS: "-Wall -Wextra -Werror"
# We require '-Wno-unused-command-line-argument' due to the following warning:
# argument unused during compilation: '--rtlib=compiler-rt'
CXX_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror"
HIP_FLAGS: "-Wno-unused-command-line-argument -Wall -Wextra -Werror"
stages:
- lint
@ -56,7 +58,7 @@ clang-format: @@ -56,7 +58,7 @@ clang-format:
- Scripts/CodeFormat/check_format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT"
.build:dockerfiles:
timeout: 20m
timeout: 60m
image:
name: gcr.io/kaniko-project/executor:debug
entrypoint: [""]
@ -94,6 +96,10 @@ build:cuda-ubuntu-dockerfile: @@ -94,6 +96,10 @@ build:cuda-ubuntu-dockerfile:
variables:
TAG: cuda-ubuntu
########################
# Ubuntu make #
########################
build:make-rocm:
image: $DOCKER_TAG_PREFIX:rocm-ubuntu
stage: build
@ -116,6 +122,10 @@ build:make-cuda: @@ -116,6 +122,10 @@ build:make-cuda:
script:
- cd $CI_PROJECT_DIR && make CXXFLAGS="$CUDA_FLAGS" GPU_RUNTIME=CUDA -j $(nproc)
########################
# Ubuntu cmake #
########################
.build:cmake:
stage: build
extends:
@ -150,28 +160,32 @@ build:cmake-rocm: @@ -150,28 +160,32 @@ build:cmake-rocm:
- cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install
build:cmake-cuda:
image: $DOCKER_TAG_PREFIX:cuda-ubuntu
extends:
- .build:cmake
tags:
- nvcc-build
script:
- cmake
-S $CI_PROJECT_DIR
-B $CI_PROJECT_DIR/build
-D GPU_RUNTIME=CUDA
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_CUDA_FLAGS="$CUDA_FLAGS"
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake
2>&1 | tee cmake_log.txt
# check if all dependencies were found
- |-
if grep -qi "could not find" cmake_log.txt; then
echo "Some CMake libraries could not be found"
exit 1
fi
- cmake --build $CI_PROJECT_DIR/build
- cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install
image: $DOCKER_TAG_PREFIX:cuda-ubuntu
extends:
- .build:cmake
tags:
- nvcc-build
script:
- cmake
-S $CI_PROJECT_DIR
-B $CI_PROJECT_DIR/build
-D GPU_RUNTIME=CUDA
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_CUDA_FLAGS="$CUDA_FLAGS"
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake
2>&1 | tee cmake_log.txt
# check if all dependencies were found
- |-
if grep -qi "could not find" cmake_log.txt; then
echo "Some CMake libraries could not be found"
exit 1
fi
- cmake --build $CI_PROJECT_DIR/build
- cmake --install $CI_PROJECT_DIR/build --prefix $CI_PROJECT_DIR/install
########################
# Ubuntu Tests #
########################
.test:
stage: test
@ -196,122 +210,104 @@ test:cuda: @@ -196,122 +210,104 @@ test:cuda:
needs:
- build:cmake-cuda
.test:windows:
extends:
- .rules:test
stage: test
needs: []
parallel:
matrix:
- BUILD_TYPE: [Debug, Release]
########################
# Windows VisualStudio #
########################
.test:rocm-windows:
extends:
- .test:windows
.test:windows-rocm:
tags:
- windows
- shell
- rx6900
.test:windows-vs:
script:
# MSBuild cannot properly resolve the `<Content Include=` dependencies, and will sometimes try to copy
# two or more files at once. This results in a warning before it retries, which is counted towards
# /warnAsError by default. For this reason, we disable the relevant warning (MSB3026).
- >
& $MSBUILD
/maxCpuCount
"/p:Configuration=$BUILD_TYPE"
/warnAsError
/warnAsMessage:MSB3026
$MSBUILD_EXTRA_OPTIONS
"$CI_PROJECT_DIR/$SOLUTION"
.test:windows-nvcc:
tags:
- nvcc-windows
test:rocm-windows-vs2019:
.test:windows-vs:
stage: test
timeout: 30m
extends:
- .test:rocm-windows
- .test:windows-vs
- .rules:test
parallel:
matrix:
- VS_VERSION:
- 2017
- 2019
- 2022
BUILD_TYPE:
- Debug
- Release
variables:
MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
SOLUTION: "ROCm-Examples-VS2019.sln"
Timeout: 30
Filter: "*_vs$VS_VERSION.exe"
script:
- !reference [".test:windows-vs", script]
- |-
$SkippedExamples = @(
"hip_vulkan_interop_vs2019.exe" # Graphical
"hip_texture_management_vs2019.exe" # Hangs sometimes
"hip_hello_world_vs2019.exe" # Crashes (known driver issue)
- | # Find MSBuild.exe of the associated version.
$MSBUILD = (
& "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -find MSBuild\**\Bin\MSBuild.exe
| Select-String -Pattern $VS_VERSION
)[0]
Write-Output ("MSBuild: $MSBUILD" -f $MSBUILD)
- | # Fixes error MSB8036: The Windows SDK version 8.1 was not found
if ($VS_VERSION -eq 2017) {
$MSBUILD_EXTRA_OPTIONS = "/p:WindowsTargetPlatformVersion=10.0.20348.0"
}
- | # Build!
& $MSBUILD @(
"/clp:Summary;ShowEventId;ShowTimestamp"
"/p:Configuration=$BUILD_TYPE"
"/p:Verbose=false"
"/maxCpuCount:8"
"/p:CL_MPCount=8"
"/verbosity:minimal"
"/validate"
"/warnAsError"
# MSBuild cannot properly resolve the `<Content Include=` dependencies, and will sometimes try to copy
# two or more files at once. This results in a warning before it retries, which is counted towards
# /warnAsError by default. For this reason, we disable the relevant warning (MSB3026).
"/warnAsMessage:MSB3026"
"/t:build"
$MSBUILD_EXTRA_OPTIONS
"$CI_PROJECT_DIR\$SOLUTION_PREFIX$VS_VERSION.sln"
)
Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" |
ForEach-Object {
if ($SkippedExamples -NotContains $_.Name) {
echo "--" $_.Name
& "$_"
if (!$?) {
throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
}
} else {
echo "-- SKIPPING " $_.Name
}
- | # Use external script to test examples
if (!$SKIP_TESTS) {
& $CI_PROJECT_DIR\Scripts\WindowsRunner.ps1 $CI_PROJECT_DIR\$BUILD_TYPE $Filter $Timeout $("$SkippedExamples".split(','))
} else {
Write-Output "Tests skipped!"
}
test:rocm-windows-vs2017:
test:windows-rocm-vs:
extends:
- .test:rocm-windows
- .test:windows-rocm
- .test:windows-vs
tags:
- windows
- shell
- rx6900
variables:
MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/MSBuild/15.0/Bin/MSBuild.exe"
SOLUTION: "ROCm-Examples-VS2017.sln"
# See https://developercommunity.visualstudio.com/t/windowstargetplatformversion-makes-it-impossible-t/140294
MSBUILD_EXTRA_OPTIONS: "/p:WindowsTargetPlatformVersion=10.0.20348.0"
SOLUTION_PREFIX: ROCm-Examples-VS
# hip_vulkan_interop: graphical
# hip_texture_management: does not work
# rocsparse_*: broken with new SDK
SkippedExamples: >
hip_vulkan_interop_*.exe,
hip_texture_management_*.exe,
test:rocm-windows-vs2022:
test:windows-nvcc-vs:
extends:
- .test:rocm-windows
- .test:windows-nvcc
- .test:windows-vs
variables:
MSBUILD: "C:/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe"
SOLUTION: "ROCm-Examples-VS2022.sln"
test:rocm-windows-cmake:
extends:
- .test:rocm-windows
script:
- Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
- Enter-VsDevShell -InstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -SkipAutomaticLocation -DevCmdArguments '/arch=x64 /host_arch=x64 /no_logo'
- cmake
-S "$CI_PROJECT_DIR"
-B "$CI_PROJECT_DIR/build"
-G Ninja
-D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-D CMAKE_HIP_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-D CMAKE_HIP_LINK_EXECUTABLE:PATH="${env:HIP_PATH}\bin\lld-link.exe"
-D CMAKE_HIP_FLAGS="-fuse-ld=lld"
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}"
-D CMAKE_HIP_FLAGS="$HIP_FLAGS"
-D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-D CMAKE_HIP_ARCHITECTURES=gfx1030
-D CMAKE_TOOLCHAIN_FILE:PATH="C:\Tools\Microsoft\vcpkg\scripts\buildsystems\vcpkg.cmake"
2>&1 | Tee-Object -filepath cmake_log.txt
- |-
if (Select-String -Path cmake_log.txt -Pattern "could not find") {
throw "Some cmake libraries are missing"
}
- cmake --build "$CI_PROJECT_DIR/build"
# CMake does not copy the dependencies to the test folder, and there is no sufficiently concise way of doing it.
# So for now, just add the library path here.
- $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH
- cd "$CI_PROJECT_DIR/build" && ctest --output-on-failure --timeout 10
- cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
.test:nvcc-windows:
extends:
- .test:windows
tags:
- nvcc-windows
variables:
SOLUTION_PREFIX: ROCm-Examples-Portable-VS
# hip_runtime_compilation: fails on VS2017
SkippedExamples: >
hip_runtime_compilation_vs2017.exe
before_script:
- | # Release builds are currently broken!
$SKIP_TESTS = ($BUILD_TYPE -eq "Release")
# To test for NVIDIA, we need to set the platform toolset to HIP_nvcc. This cannot be done with /p:PlatformToolset
# though, as some examples use the regular msvc toolchain.
- |
@ -321,76 +317,75 @@ test:rocm-windows-cmake: @@ -321,76 +317,75 @@ test:rocm-windows-cmake:
Set-Content $f
}
test:nvcc-windows-vs2019:
########################
# Windows cmake #
########################
.test:windows-cmake:
extends:
- .test:nvcc-windows
- .test:windows-vs
- .rules:test
variables:
MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
SOLUTION: "ROCm-Examples-Portable-VS2019.sln"
VS_VERSION: 2022
BUILD_TYPE: Release
before_script:
- | # Find VS installation
$VS_PATH = (
& "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -property InstallationPath
| Select-String -Pattern $VS_VERSION
)[0]
- | # Find DevShell.dll
$VS_DEV_SHELL = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -path "$VS_PATH" -find "**\Tools\Microsoft.VisualStudio.DevShell.dll"
- Import-Module "$VS_DEV_SHELL"
- Enter-VsDevShell -InstallPath "$VS_PATH" -SkipAutomaticLocation -DevCmdArguments '/arch=x64 /host_arch=x64 /no_logo'
script:
- !reference [".test:windows-vs", script]
- |-
$SkippedExamples = @(
"hip_vulkan_interop_vs2019.exe" # Graphical
"hip_opengl_interop_vs2019.exe" # Graphical
)
Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" |
ForEach-Object {
if ($SkippedExamples -NotContains $_.Name) {
echo "--" $_.Name
& "$CI_PROJECT_DIR/$BUILD_TYPE/$_"
if (!$?) {
throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
}
} else {
echo "-- SKIPPING " $_.Name
}
- | # Ensure no libraries are missing during compilation!
if (Select-String -Path cmake_log.txt -Pattern "could not find") {
throw "Some cmake libraries are missing"
}
- cmake --build "$CI_PROJECT_DIR/build"
# CMake does not copy the dependencies to the test folder, and there is no sufficiently concise way of doing it.
# So for now, just add the library path here.
- $env:PATH = "${env:HIP_PATH}\bin;" + $env:PATH
- cd "$CI_PROJECT_DIR/build"
- ctest --output-on-failure --timeout 15 -E "rocsparse_bsrsv|rocsparse_csrsv|rocsparse_spsv|rocsparse_bsrsm|rocsparse_csrsm|rocsparse_bsric0|rocsparse_bsrilu0|rocsparse_csric0|rocsparse_csrilu0"
- cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
needs: []
test:nvcc-windows-vs2017:
extends:
- .test:nvcc-windows
- .test:windows-vs
variables:
MSBUILD: "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/MSBuild/15.0/Bin/MSBuild.exe"
SOLUTION: "ROCm-Examples-Portable-VS2017.sln"
# See https://developercommunity.visualstudio.com/t/windowstargetplatformversion-makes-it-impossible-t/140294
MSBUILD_EXTRA_OPTIONS: "/p:WindowsTargetPlatformVersion=10.0.20348.0"
test:nvcc-windows-vs2022:
test:windows-rocm-cmake:
extends:
- .test:nvcc-windows
- .test:windows-vs
variables:
MSBUILD: "C:/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe"
SOLUTION: "ROCm-Examples-Portable-VS2022.sln"
- .test:windows-rocm
- .test:windows-cmake
script:
- cmake
-S "$CI_PROJECT_DIR"
-B "$CI_PROJECT_DIR/build"
-G Ninja
-D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-D CMAKE_HIP_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe"
-D CMAKE_HIP_LINK_EXECUTABLE:PATH="${env:HIP_PATH}\bin\lld-link.exe"
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}"
-D CMAKE_HIP_FLAGS="$HIP_FLAGS"
-D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-D CMAKE_HIP_ARCHITECTURES=gfx1030
-D CMAKE_TOOLCHAIN_FILE:PATH="C:\Tools\Microsoft\vcpkg\scripts\buildsystems\vcpkg.cmake"
2>&1 | Tee-Object -filepath cmake_log.txt
- !reference [.test:windows-cmake, script]
test:nvcc-windows-cmake:
test:windows-nvcc-cmake:
extends:
- .test:nvcc-windows
- .test:windows-nvcc
- .test:windows-cmake
script:
# Import the VisualStudio 2022 development environment
- |-
$vs = &"C:/Program Files (x86)/Microsoft Visual Studio/Installer/vswhere.exe" -version 17.0 -property InstallationPath
Import-Module (Join-Path $vs "Common7/Tools/Microsoft.VisualStudio.DevShell.dll")
Enter-VsDevShell -VsInstallPath $vs -SkipAutomaticLocation -DevCmdArguments "/arch=x64 /host_arch=x64 /no_logo"
# Note: The current version of the HIP SDK does not ship with CMake config files for Nvidia, so we can only test
# the HIP-Basic and Applications examples. It is expected that some dependencies will not be found for this.
- cmake
-S "$CI_PROJECT_DIR"
-B "$CI_PROJECT_DIR/build"
-G Ninja
-D CMAKE_CXX_COMPILER="cl.exe"
-D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-D CMAKE_TOOLCHAIN_FILE="C:/Tools/Microsoft/vcpkg/scripts/buildsystems/vcpkg.cmake"
-D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-D CMAKE_CXX_COMPILER="cl.exe"
-D GPU_RUNTIME=CUDA
2>&1 | Tee-Object -filepath cmake_log.txt
- |-
if (Select-String -Path cmake_log.txt -Pattern "could not find") {
throw "Some cmake libraries are missing"
}
- cmake --build "$CI_PROJECT_DIR/build"
- cd "$CI_PROJECT_DIR/build"
- ctest --output-on-failure --timeout 10
- cmake --install "$CI_PROJECT_DIR/build" --prefix "$CI_PROJECT_DIR/install"
- !reference [.test:windows-cmake, script]

20
.gitlab/issue_templates/example.md

@ -1,22 +1,12 @@ @@ -1,22 +1,12 @@
# Example checklist
- Elaboration
- [ ] Example concept is described and agreed on
- [ ] Example concept is described and agreed upon
- Implementation
- [ ] Example is implemented
- CMake support is added
- [ ] Linux
- [ ] Windows
- [ ] GNU Make support is added (Linux)
- [ ] Visual Studio project is added (Windows)
- [ ] Project is added to the root solution
- [ ] Inline code documentation is added
- [ ] README is added according to template
- [ ] Related READMEs, ToC are updated
- [ ] Internal CI passes
- [ ] Example is implemented
- Internal review
- [ ] Internal code review is done
- [ ] Internal code review is done
- External review
- [ ] Upstreaming PR is opened, external code review is done
- [ ] Upstreaming PR is opened, external review is done
- Done
- [ ] Example merged to upstream
- [ ] Example merged to upstream

16
.gitlab/merge_request_templates/example.md

@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
## Notes for the reviewer
_The reviewer should acknowledge all these topics._
<insert notes>
## Checklist before merge
- [ ] CMake support is added
- [ ] Dependencies are copied via `IMPORTED_RUNTIME_ARTIFACTS` if applicable
- [ ] GNU Make support is added (Linux)
- [ ] Visual Studio project is added for VS2017, 2019, 2022 (Windows) (use [the script](https://projects.streamhpc.com/departments/knowledge/employee-handbook/-/wikis/Projects/AMD/Libraries/examples/Adding-Visual-Studio-Projects-to-new-examples#scripts))
- [ ] DLL dependencies are copied via `<Content Include`
- [ ] Visual Studio project is added to `ROCm-Examples-vs*.sln` (ROCm)
- [ ] Visual Studio project is added to `ROCm-Examples-Portable-vs*.sln` (ROCm/CUDA) if applicable
- [ ] Inline code documentation is added
- [ ] README is added according to template
- [ ] Related READMEs, ToC are updated
- [ ] The CI passes for Linux/ROCm, Linux/CUDA, Windows/ROCm, Windows/CUDA.

4
Applications/floyd_warshall/main.hip

@ -198,8 +198,8 @@ int main(int argc, char* argv[]) @@ -198,8 +198,8 @@ int main(int argc, char* argv[])
// Allocate device memory
unsigned int* d_adjacency_matrix;
unsigned int* d_next_matrix;
HIP_CHECK(hipMalloc((void**)&d_adjacency_matrix, size_bytes));
HIP_CHECK(hipMalloc((void**)&d_next_matrix, size_bytes));
HIP_CHECK(hipMalloc(&d_adjacency_matrix, size_bytes));
HIP_CHECK(hipMalloc(&d_next_matrix, size_bytes));
// Create events to measure the execution time of the kernels.
hipEvent_t start, stop;

4
CMakeLists.txt

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@ -20,7 +20,7 @@ @@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
cmake_minimum_required(VERSION 3.21.3 FATAL_ERROR)
project(ROCMm-SDK-Examples LANGUAGES CXX)
enable_testing()

14
Common/example_utils.hpp

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -23,6 +23,18 @@ @@ -23,6 +23,18 @@
#ifndef COMMON_EXAMPLE_UTILS_HPP
#define COMMON_EXAMPLE_UTILS_HPP
// Compiling HIP on Windows includes windows.h, and this triggers many silly warnings.
#if defined(_WIN32) && defined(__NVCC__)
#pragma nv_diag_suppress 108 // signed bit field of length 1
#pragma nv_diag_suppress 174 // expression has no effect
#pragma nv_diag_suppress 1835 // attribute "dllimport" does not apply here
#endif
// rocPRIM adds a #warning about printf on NAVI.
#ifdef __clang__
#pragma clang diagnostic ignored "-W#warnings"
#endif
#include <cassert>
#include <chrono>
#include <iostream>

75
Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile

@ -27,7 +27,7 @@ RUN export DEBIAN_FRONTEND=noninteractive; \ @@ -27,7 +27,7 @@ RUN export DEBIAN_FRONTEND=noninteractive; \
# Install HIP using the installer script
RUN export DEBIAN_FRONTEND=noninteractive; \
wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
&& echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.3/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
&& echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.4/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
&& apt-get update -qq \
&& apt-get install -y hip-base hipify-clang \
&& apt-get download hip-runtime-nvidia hip-dev \
@ -45,64 +45,71 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \ @@ -45,64 +45,71 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \
&& ldconfig
# Install rocRAND
RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.3.0.tar.gz \
&& tar -xf ./rocm-5.3.0.tar.gz \
&& rm ./rocm-5.3.0.tar.gz \
&& cmake -S ./rocRAND-rocm-5.3.0 -B ./rocRAND-rocm-5.3.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./rocRAND-rocm-5.4.0 -B ./rocRAND-rocm-5.4.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D BUILD_HIPRAND=OFF \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
&& cmake --build ./rocRAND-rocm-5.3.0/build --target install \
&& rm -rf ./rocRAND-rocm-5.3.0
&& cmake --build ./rocRAND-rocm-5.4.0/build --target install \
&& rm -rf ./rocRAND-rocm-5.4.0
# Install hipCUB
RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.3.0.tar.gz \
&& tar -xf ./rocm-5.3.0.tar.gz \
&& rm ./rocm-5.3.0.tar.gz \
&& cmake -S ./hipCUB-rocm-5.3.0 -B ./hipCUB-rocm-5.3.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipCUB-rocm-5.4.0 -B ./hipCUB-rocm-5.4.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
&& cmake --build ./hipCUB-rocm-5.3.0/build --target install \
&& rm -rf ./hipCUB-rocm-5.3.0
&& cmake --build ./hipCUB-rocm-5.4.0/build --target install \
&& rm -rf ./hipCUB-rocm-5.4.0
# Install hipBLAS
RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.3.0.tar.gz \
&& tar -xf ./rocm-5.3.0.tar.gz \
&& rm ./rocm-5.3.0.tar.gz \
&& cmake -S ./hipBLAS-rocm-5.3.0 -B ./hipBLAS-rocm-5.3.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipBLAS-rocm-5.4.0 -B ./hipBLAS-rocm-5.4.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
-D USE_CUDA=ON \
&& cmake --build ./hipBLAS-rocm-5.3.0/build --target install \
&& rm -rf ./hipBLAS-rocm-5.3.0
&& cmake --build ./hipBLAS-rocm-5.4.0/build --target install \
&& rm -rf ./hipBLAS-rocm-5.4.0
# Install hipSOLVER
RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.3.0.tar.gz \
&& tar -xf ./rocm-5.3.0.tar.gz \
&& rm ./rocm-5.3.0.tar.gz \
&& cmake -S ./hipSOLVER-rocm-5.3.0 -B ./hipSOLVER-rocm-5.3.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipSOLVER-rocm-5.4.0 -B ./hipSOLVER-rocm-5.4.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
-D USE_CUDA=ON \
&& cmake --build ./hipSOLVER-rocm-5.3.0/build --target install \
&& rm -rf ./hipSOLVER-rocm-5.3.0
&& cmake --build ./hipSOLVER-rocm-5.4.0/build --target install \
&& rm -rf ./hipSOLVER-rocm-5.4.0
# Install hipRAND
RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.3.0.tar.gz \
&& tar -xf ./rocm-5.3.0.tar.gz \
&& rm ./rocm-5.3.0.tar.gz \
&& cmake -S ./hipRAND-rocm-5.3.0 -B ./hipRAND-rocm-5.3.0/build \
RUN wget https://github.com/ROCmSoftwarePlatform/hipRAND/archive/refs/tags/rocm-5.4.0.tar.gz \
&& tar -xf ./rocm-5.4.0.tar.gz \
&& rm ./rocm-5.4.0.tar.gz \
&& cmake -S ./hipRAND-rocm-5.4.0 -B ./hipRAND-rocm-5.4.0/build \
-D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-D CMAKE_INSTALL_PREFIX=/opt/rocm \
&& cmake --build ./hipRAND-rocm-5.3.0/build --target install \
&& rm -rf ./hipRAND-rocm-5.3.0
-D BUILD_WITH_LIB=CUDA \
&& cmake --build ./hipRAND-rocm-5.4.0/build --target install \
&& rm -rf ./hipRAND-rocm-5.4.0
# Use render group as an argument from user
ARG GID=109
# Add the render group and a user with sudo permissions for the container
RUN groupadd --system --gid ${GID} render \
&& useradd -Um -G sudo,video,render developer \
# Add the render group or change id if already exists
RUN if [ $(getent group render) ]; then \
groupmod --gid ${GID} render; \
else \
groupadd --system --gid ${GID} render; \
fi
# Add a user with sudo permissions for the container
RUN useradd -Um -G sudo,video,render developer \
&& echo developer ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/developer \
&& chmod 0440 /etc/sudoers.d/developer

5
HIP-Basic/cooperative_groups/CMakeLists.txt

@ -54,6 +54,11 @@ add_test(${example_name} ${example_name}) @@ -54,6 +54,11 @@ add_test(${example_name} ${example_name})
set(include_dirs "../../Common")
if(GPU_RUNTIME STREQUAL "CUDA")
list(APPEND include_dirs "${ROCM_ROOT}/include")
else()
# Add NDEBUG for HIP version >= 5.5 and < 6.0 due to a known bug in the cooperative groups header
if( ${hip-lang_VERSION} VERSION_GREATER_EQUAL 5.5 AND ${hip-lang_VERSION} VERSION_LESS 6 )
add_compile_definitions(NDEBUG)
endif()
endif()
target_include_directories(${example_name} PRIVATE ${include_dirs})

2
HIP-Basic/device_globals/CMakeLists.txt

@ -20,7 +20,7 @@ @@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
set(example_name device_globals)
set(example_name hip_device_globals)
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(${example_name} LANGUAGES CXX)

6
HIP-Basic/device_query/main.cpp

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,13 +20,13 @@ @@ -20,13 +20,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <iomanip>
#include <iostream>
#include <hip/hip_runtime.h>
#include "example_utils.hpp"
namespace
{
/// Number of characters in the first column.

4
HIP-Basic/occupancy/main.hip

@ -165,8 +165,8 @@ int main() @@ -165,8 +165,8 @@ int main()
// Initialize the input data
for(int i = 0; i < size; i++)
{
h_A[i] = (float)i;
h_B[i] = (float)i;
h_A[i] = static_cast<float>(i);
h_B[i] = static_cast<float>(i);
}
float* d_A = nullptr;

5
HIP-Basic/texture_management/main.hip

@ -24,6 +24,7 @@ @@ -24,6 +24,7 @@
#include <hip/hip_runtime.h>
#include <array>
#include <iostream>
#include <vector>
@ -147,8 +148,8 @@ int main() @@ -147,8 +148,8 @@ int main()
HIP_CHECK(hipGetLastError());
// Copy data from device back to host.
unsigned int h_histogram[hist_bin_count];
HIP_CHECK(hipMemcpy(h_histogram, d_histogram, hist_bytes, hipMemcpyDeviceToHost));
std::array<unsigned int, hist_bin_count> h_histogram;
HIP_CHECK(hipMemcpy(h_histogram.data(), d_histogram, hist_bytes, hipMemcpyDeviceToHost));
// Print out results.
std::cout << "Equal-width histogram with " << hist_bin_count << " bins of values [0, " << size

35
Libraries/hipBLAS/gemm_strided_batched/README.md

@ -3,20 +3,21 @@ @@ -3,20 +3,21 @@
## Description
This example illustrates the use of the hipBLAS Level 3 Strided Batched General Matrix Multiplication. The hipBLAS GEMM STRIDED BATCHED performs a matrix--matrix operation for a _batch_ of matrices as:
$C[i] = \alpha \cdot f(A[i]) \cdot f(B[i]) + \beta \cdot (C[i])$
$C[i] = \alpha \cdot A[i]' \cdot B[i]' + \beta \cdot (C[i])$
for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $f(X)$ is one of the following:
- $f(X) = X$ or
- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the corresponding batch and $X'$ is one of the following:
- $X' = X$ or
- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
In this example the identity is used.
$\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are the batches of matrices. For each $i$, $A[i]$, $B[i]$ and $C[i]$ are matrices such that
$f(A[i])$ is an $m \times k$ matrix, $f(B[i])$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
$A[i]'$ is an $m \times k$ matrix, $B[i]'$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
### Application flow
1. Read in command-line parameters.
2. Set $f$ operation, set sizes of matrices and get batch count.
2. Set dimension variables of the matrices and get the batch count.
3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix.
4. Initialize gold standard matrix.
5. Compute CPU reference result with strided batched subvectors.
@ -33,19 +34,19 @@ The application provides the following optional command line arguments: @@ -33,19 +34,19 @@ The application provides the following optional command line arguments:
- `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1.
- `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1.
- `-c` or `--count`. Batch count. Its default value is 3.
- `-m` or `--m`. The number of rows of matrices $f(A)$ and $C$, which must be greater than 0. Its default value is 5.
- `-n` or `--n`. The number of columns of matrices $f(B)$ and $C$, which must be greater than 0. Its default value is 5.
- `-k` or `--k`. The number of columns of matrix $f(A)$ and rows of matrix $f(B)$, which must be greater than 0. Its default value is 5.
- `-m` or `--m`. The number of rows of matrices $A$ and $C$, which must be greater than 0. Its default value is 5.
- `-n` or `--n`. The number of columns of matrices $B$ and $C$, which must be greater than 0. Its default value is 5.
- `-k` or `--k`. The number of columns of matrix $A$ and rows of matrix $B$, which must be greater than 0. Its default value is 5.
## Key APIs and Concepts
- The performance of a numerical multi-linear algebra code can be heavily increased by using tensor contractions [ [Y. Shi et al., HiPC, pp 193, 2016.](https://doi.org/10.1109/HiPC.2016.031) ], therefore most of the hipBLAS functions have a `_batched` and a `_strided_batched` [ [C. Jhurani and P. Mullowney, JPDC Vol 75, pp 133, 2015.](https://doi.org/10.1016/j.jpdc.2014.09.003) ] extension.<br/>
We can apply the same multiplication operator for several matrices if we combine them into batched matrices. Batched matrix multiplication has a performance improvement for a large number of small matrices. For a constant stride between matrices, further acceleration is available by strided batched GEMM.
- hipBLAS is initialized by calling `hipblasCreate(hipblasHandle*)` and it is terminated by calling `hipblasDestroy(hipblasHandle)`.
- The _pointer mode_ controls whether scalar parameters must be allocated on the host (`HIPBLAS_POINTER_MODE_HOST`) or on the device (`HIPBLAS_POINTER_MODE_DEVICE`). It is controlled by `hipblasSetPointerMode`.
- The $f$ operator -- defined in Description section -- can be
- `HIPBLAS_OP_N`: identity operator ($f(X) = X$),
- `HIPBLAS_OP_T`: transpose operator ($f(X) = X^T$) or
- `HIPBLAS_OP_C`: Hermitian (conjugate transpose) operator ($f(X) = X^H$).
- The symbol $X'$ denotes the following operations, as defined in the Description section:
- `HIPBLAS_OP_N`: identity operator ($X' = X$),
- `HIPBLAS_OP_T`: transpose operator ($X' = X^T$) or
- `HIPBLAS_OP_C`: Hermitian (conjugate transpose) operator ($X' = X^H$).
- `hipblasStride` strides between matrices or vectors in strided_batched functions.
- `hipblas[HSDCZ]gemmStridedBatched`
@ -60,9 +61,9 @@ We can apply the same multiplication operator for several matrices if we combine @@ -60,9 +61,9 @@ We can apply the same multiplication operator for several matrices if we combine
- `hipblasHandle_t handle`
- `hipblasOperation_t trans_a`: transformation operator on each $A_i$ matrix
- `hipblasOperation_t trans_b`: transformation operator on each $B_i$ matrix
- `int m`: number of rows in each $f(A_i)$ and $C$ matrices
- `int n`: number of columns in each $f(B_i)$ and $C$ matrices
- `int k`: number of columns in each $f(A_i)$ matrix and number of rows in each $f(B_i)$ matrix
- `int m`: number of rows in each $A_i'$ and $C$ matrices
- `int n`: number of columns in each $B_i'$ and $C$ matrices
- `int k`: number of columns in each $A_i'$ matrix and number of rows in each $B_i'$ matrix
- `const float *alpha`: scalar multiplier of each $C_i$ matrix addition
- `const float *A`: pointer to each $A_i$ matrix
- `int lda`: leading dimension of each $A_i$ matrix

10
Libraries/hipBLAS/gemm_strided_batched/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -42,9 +42,9 @@ int main(const int argc, const char** argv) @@ -42,9 +42,9 @@ int main(const int argc, const char** argv)
parser.set_optional<float>("a", "alpha", 1.f, "Alpha scalar");
parser.set_optional<float>("b", "beta", 1.f, "Beta scalar");
parser.set_optional<int>("c", "count", 3, "Batch count");
parser.set_optional<int>("m", "m", 5, "Number of rows of matrices f(A_i) and C_i");
parser.set_optional<int>("n", "n", 5, "Number of columns of matrices f(B_i) and C_i");
parser.set_optional<int>("k", "k", 5, "Number of columns of matrix f(A_i) and rows of f(B_i)");
parser.set_optional<int>("m", "m", 5, "Number of rows of matrices A_i and C_i");
parser.set_optional<int>("n", "n", 5, "Number of columns of matrices B_i and C_i");
parser.set_optional<int>("k", "k", 5, "Number of columns of matrix A_i and rows of B_i");
parser.run_and_exit_if_error();
// Set sizes of matrices.
@ -84,7 +84,7 @@ int main(const int argc, const char** argv) @@ -84,7 +84,7 @@ int main(const int argc, const char** argv)
const float h_alpha = parser.get<float>("a");
const float h_beta = parser.get<float>("b");
// Set GEMM operation as identity operation: $f(X) = X$
// Set GEMM operation as identity operation: $X' = X$
const hipblasOperation_t trans_a = HIPBLAS_OP_N;
const hipblasOperation_t trans_b = HIPBLAS_OP_N;

8
Libraries/hipCUB/device_radix_sort/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,6 +20,8 @@ @@ -20,6 +20,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <cassert>
#include <iostream>
#include <vector>
@ -27,12 +29,10 @@ @@ -27,12 +29,10 @@
#include <hip/hip_runtime.h>
#include <hipcub/device/device_radix_sort.hpp>
#include "example_utils.hpp"
int main()
{
// Allocate and initialize data on the host
const std::vector<float> h_keys{9.3, 2.1, 7.3, 4, 2.2, 5, 3.6, 2.7, 1.1, 0};
const std::vector<float> h_keys{9.3f, 2.1f, 7.3f, 4.0f, 2.2f, 5.0f, 3.6f, 2.7f, 1.1f, 0.0f};
const std::vector<int> h_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
assert(h_keys.size() == h_values.size());
const int num_elements = h_keys.size();

6
Libraries/hipCUB/device_sum/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,14 +20,14 @@ @@ -20,14 +20,14 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <iostream>
#include <vector>
#include <hip/hip_runtime.h>
#include <hipcub/device/device_reduce.hpp>
#include "example_utils.hpp"
int main()
{
// Allocate and initialize data on the host

2
Libraries/hipSOLVER/syevj/main.cpp

@ -55,7 +55,7 @@ int main(const int argc, char* argv[]) @@ -55,7 +55,7 @@ int main(const int argc, char* argv[])
// 3. Generate a random symmetric matrix
std::default_random_engine generator;
std::uniform_real_distribution<double> distribution(0., 2.);
auto random_number = std::bind(distribution, generator);
auto random_number = [&]() { return distribution(generator); };
for(int i = 0; i < n; i++)
{

28
Libraries/rocBLAS/level_3/gemm/README.md

@ -2,17 +2,19 @@ @@ -2,17 +2,19 @@
## Description
This example illustrates the use of the rocBLAS Level 3 General Matrix Multiplication. The rocBLAS GEMM performs a matrix--matrix operation as:
$C = \alpha \cdot f(A) \cdot f(B) + \beta \cdot C$,
where $f(X)$ is one of the following:
- $f(X) = X$ or
- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar{X_{ji}} $),
$C = \alpha \cdot A' \cdot B' + \beta \cdot C$,
where $X'$ is one of the following:
- $X' = X$ or
- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar{X_{ji}} $),
In this example the identity is used.
$\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are matrices, with
$f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times n$ matrix.
$A'$ an $m \times k$ matrix, $B'$ a $k \times n$ matrix and $C$ an $m \times n$ matrix.
### Application flow
1. Read in command-line parameters.
2. Set $f$ operation and set sizes of matrices.
2. Set dimension variables of the matrices.
3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix.
4. Initialize gold standard matrix.
5. Compute CPU reference result.
@ -28,9 +30,9 @@ $f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times @@ -28,9 +30,9 @@ $f(A)$ an $m \times k$ matrix, $f(B)$ a $k \times n$ matrix and $C$ an $m \times
The application provides the following optional command line arguments:
- `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1.
- `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1.
- `-m` or `--m`. The number of rows of matrices $f(A)$ and $C$, which must be greater than 0. Its default value is 5.
- `-n` or `--n`. The number of columns of matrices $f(B)$ and $C$, which must be greater than 0. Its default value is 5.
- `-k` or `--k`. The number of columns of matrix $f(A)$ and rows of matrix $f(B)$, which must be greater than 0. Its default value is 5.
- `-m` or `--m`. The number of rows of matrices $A$ and $C$, which must be greater than 0. Its default value is 5.
- `-n` or `--n`. The number of columns of matrices $B$ and $C$, which must be greater than 0. Its default value is 5.
- `-k` or `--k`. The number of columns of matrix $A$ and rows of matrix $B$, which must be greater than 0. Its default value is 5.
## Key APIs and Concepts
- rocBLAS is initialized by calling `rocblas_create_handle(rocblas_handle*)` and it is terminated by calling `rocblas_destroy_handle(rocblas_handle)`.
@ -47,9 +49,9 @@ The application provides the following optional command line arguments: @@ -47,9 +49,9 @@ The application provides the following optional command line arguments:
- `rocblas_handle handle`
- `rocblas_operation transA`: transformation operator on $A$ matrix
- `rocblas_operation transB`: transformation operator on $B$ matrix
- `rocblas_int m`: number of rows in $f(A)$ and $C$ matrices
- `rocblas_int n`: number of columns in $f(B)$ and $C$ matrices
- `rocblas_int k`: number of columns in $f(A)$ matrix and number of rows in $f(B)$ matrix
- `rocblas_int m`: number of rows in $A'$ and $C$ matrices
- `rocblas_int n`: number of columns in $B'$ and $C$ matrices
- `rocblas_int k`: number of columns in $A'$ matrix and number of rows in $B'$ matrix
- `const float *alpha`: scalar multiplier of $C$ matrix addition
- `const float *A`: pointer to the $A$ matrix
- `rocblas_int lda`: leading dimension of $A$ matrix

10
Libraries/rocBLAS/level_3/gemm/main.cpp

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -40,9 +40,9 @@ int main(const int argc, const char** argv) @@ -40,9 +40,9 @@ int main(const int argc, const char** argv)
cli::Parser parser(argc, argv);
parser.set_optional<float>("a", "alpha", 1.f, "Alpha scalar");
parser.set_optional<float>("b", "beta", 1.f, "Beta scalar");
parser.set_optional<int>("m", "m", 5, "Number of rows of matrices f(A) and C");
parser.set_optional<int>("n", "n", 5, "Number of columns of matrices f(B) and C");
parser.set_optional<int>("k", "k", 5, "Number of columns of matrix f(A) and rows of f(B)");
parser.set_optional<int>("m", "m", 5, "Number of rows of matrices A and C");
parser.set_optional<int>("n", "n", 5, "Number of columns of matrices B and C");
parser.set_optional<int>("k", "k", 5, "Number of columns of matrix A and rows of B");
parser.run_and_exit_if_error();
// Set sizes of matrices.
@ -73,7 +73,7 @@ int main(const int argc, const char** argv) @@ -73,7 +73,7 @@ int main(const int argc, const char** argv)
const rocblas_float h_alpha = parser.get<float>("a");
const rocblas_float h_beta = parser.get<float>("b");
// Set GEMM operation as identity operation: $f(X) = X$
// Set GEMM operation as identity operation: $X' = X$
const rocblas_operation trans_a = rocblas_operation_none;
const rocblas_operation trans_b = rocblas_operation_none;

27
Libraries/rocBLAS/level_3/gemm_strided_batched/README.md

@ -3,20 +3,21 @@ @@ -3,20 +3,21 @@
## Description
This example illustrates the use of the rocBLAS Level 3 Strided Batched General Matrix Multiplication. The rocBLAS GEMM STRIDED BATCHED performs a matrix--matrix operation for a _batch_ of matrices as:
$C[i] = \alpha \cdot f(A[i]) \cdot f(B[i]) + \beta \cdot (C[i])$
$C[i] = \alpha \cdot A[i]' \cdot B[i]' + \beta \cdot (C[i])$
for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $f(X)$ is one of the following:
- $f(X) = X$ or
- $f(X) = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
- $f(X) = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
for each $i \in [0, batch - 1]$, where $X[i] = X + i \cdot strideX$ is the $i$-th element of the correspondent batch and $X'$ is one of the following:
- $X' = X$ or
- $X' = X^T$ (transpose $X$: $X_{ij}^T = X_{ji}$) or
- $X' = X^H$ (Hermitian $X$: $X_{ij}^H = \bar X_{ji} $).
In this example the identity is used.
$\alpha$ and $\beta$ are scalars, and $A$, $B$ and $C$ are the batches of matrices. For each $i$, $A[i]$, $B[i]$ and $C[i]$ are matrices such that
$f(A[i])$ is an $m \times k$ matrix, $f(B[i])$ a $k \times n$ matrix and $C[i]$ an $m \times n$ matrix.
$A_i'$ is an $m \times k$ matrix, $B_i'$ a $k \times n$ matrix and $C_i$ an $m \times n$ matrix.
### Application flow
1. Read in command-line parameters.
2. Set $f$ operation, set sizes of matrices and get batch count.
2. Set dimension variables of the matrices and get batch count and stride.
3. Allocate and initialize the host matrices. Set up $B$ matrix as an identity matrix.
4. Initialize gold standard matrix.
5. Compute CPU reference result with strided batched subvectors.
@ -33,9 +34,9 @@ The application provides the following optional command line arguments: @@ -33,9 +34,9 @@ The application provides the following optional command line arguments:
- `-a` or `--alpha`. The scalar value $\alpha$ used in the GEMM operation. Its default value is 1.
- `-b` or `--beta`. The scalar value $\beta$ used in the GEMM operation. Its default value is 1.
- `-c` or `--count`. Batch count. Its default value is 3.
- `-m` or `--m`. The number of rows of matrices $f(A_i)$ and $C_i$, which must be greater than 0. Its default value is 5.
- `-n` or `--n`. The number of columns of matrices $f(B_i)$ and $C_i$, which must be greater than 0. Its default value is 5.
- `-k` or `--k`. The number of columns of columns of matrix f(A_i) and rows of f(B_i)
- `-m` or `--m`. The number of rows of matrices $A_i$ and $C_i$, which must be greater than 0. Its default value is 5.
- `-n` or `--n`. The number of columns of matrices $B_i$ and $C_i$, which must be greater than 0. Its default value is 5.
- `-k` or `--k`. The number of columns of matrix $A_i$ and rows of matrix $B_i$, which must be greater than 0. Its default value is 5.
## Key APIs and Concepts
- The performance of a numerical multi-linear algebra code can be heavily increased by using tensor contractions [ [Y. Shi et al., HiPC, pp 193, 2016.](https://doi.org/10.1109/HiPC.2016.031) ], therefore most of the rocBLAS functions have a `_batched` and a `_strided_batched` [ [C. Jhurani and P. Mullowney, JPDC Vol 75, pp 133, 2015.](https://doi.org/10.1016/j.jpdc.2014.09.003) ] extension.<br/>
@ -57,9 +58,9 @@ We can apply the same multiplication operator for several matrices if we combine @@ -57,9 +58,9 @@ We can apply the same multiplication operator for several matrices if we combine
- `rocblas_handle handle`
- `rocblas_operation transA`: transformation operator on $A_i$ matrix
- `rocblas_operation transB`: transformation operator on $B_i$ matrix
- `rocblas_int m`: number of rows in $f(A_i)$ and $C_i$ matrices
- `rocblas_int n`: number of columns in $f(B_i)$ and $C_i$ matrices
- `rocblas_int k`: number of columns in $f(A_i)$ matrix and number of rows in $f(B_i)$ matrix
- `rocblas_int m`: number of rows in $A_i'$ and $C_i$ matrices
- `rocblas_int n`: number of columns in $B_i'$ and $C_i$ matrices
- `rocblas_int k`: number of columns in $A_i'$ matrix and number of rows in $B_i'$ matrix
- `const float *alpha`: scalar multiplier of $C_i$ matrix addition
- `const float *A`: pointer to each $A_i$ matrix
- `rocblas_int lda`: leading dimension of each $A_i$ matrix

10
Libraries/rocBLAS/level_3/gemm_strided_batched/main.cpp

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -42,9 +42,9 @@ int main(const int argc, const char** argv) @@ -42,9 +42,9 @@ int main(const int argc, const char** argv)
parser.set_optional<float>("a", "alpha", 1.f, "Alpha scalar");
parser.set_optional<float>("b", "beta", 1.f, "Beta scalar");
parser.set_optional<int>("c", "count", 3, "Batch count");
parser.set_optional<int>("m", "m", 5, "Number of rows of matrices f(A_i) and C_i");
parser.set_optional<int>("n", "n", 5, "Number of columns of matrices f(B_i) and C_i");
parser.set_optional<int>("k", "k", 5, "Number of columns of matrix f(A_i) and rows of f(B_i)");
parser.set_optional<int>("m", "m", 5, "Number of rows of matrices A_i and C_i");
parser.set_optional<int>("n", "n", 5, "Number of columns of matrices B_i and C_i");
parser.set_optional<int>("k", "k", 5, "Number of columns of matrix A_i and rows of B_i");
parser.run_and_exit_if_error();
// Set sizes of matrices.
@ -84,7 +84,7 @@ int main(const int argc, const char** argv) @@ -84,7 +84,7 @@ int main(const int argc, const char** argv)
const rocblas_float h_alpha = parser.get<float>("a");
const rocblas_float h_beta = parser.get<float>("b");
// Set GEMM operation as identity operation: $f(X) = X$.
// Set GEMM operation as identity operation: $X' = X$.
const rocblas_operation trans_a = rocblas_operation_none;
const rocblas_operation trans_b = rocblas_operation_none;

4
Libraries/rocPRIM/block_sum/main.hip

@ -20,6 +20,8 @@ @@ -20,6 +20,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <iostream>
#include <numeric>
@ -28,8 +30,6 @@ @@ -28,8 +30,6 @@
#include <rocprim/block/block_load.hpp>
#include <rocprim/block/block_reduce.hpp>
#include "example_utils.hpp"
/// \brief Compute the sum of an array on the host CPU
std::vector<int> reduce_sum_host(const std::vector<int>& data,
const unsigned int run_size,

6
Libraries/rocPRIM/device_sum/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,14 +20,14 @@ @@ -20,14 +20,14 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <iostream>
#include <vector>
#include <hip/hip_runtime.h>
#include <rocprim/device/device_reduce.hpp>
#include "example_utils.hpp"
int main()
{
// Allocate and initialize data on the host

6
Libraries/rocThrust/device_ptr/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,6 +20,8 @@ @@ -20,6 +20,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <cassert>
#include <iostream>
#include <sstream>
@ -33,8 +35,6 @@ @@ -33,8 +35,6 @@
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include "example_utils.hpp"
int main()
{
// Allocate memory buffer to store 10 integers on the device

6
Libraries/rocThrust/norm/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,6 +20,8 @@ @@ -20,6 +20,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <cmath>
#include <cstddef>
#include <iostream>
@ -29,8 +31,6 @@ @@ -29,8 +31,6 @@
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include "example_utils.hpp"
// An anonymous namespace sets static linkage to its contents.
// This means that the contained function definitions will only be visible
// in the current compilation unit (i.e. cpp source file).

6
Libraries/rocThrust/reduce_sum/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,6 +20,8 @@ @@ -20,6 +20,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <cstddef>
#include <iostream>
@ -27,8 +29,6 @@ @@ -27,8 +29,6 @@
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include "example_utils.hpp"
int main()
{
// create a host vector with 4 elements

6
Libraries/rocThrust/remove_points/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,13 +20,13 @@ @@ -20,13 +20,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/remove.h>
#include "example_utils.hpp"
// An anonymous namespace sets static linkage to its contents.
// This means that the contained function definitions will only be visible
// in the current compilation unit (i.e. cpp source file).

6
Libraries/rocThrust/saxpy/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,14 +20,14 @@ @@ -20,14 +20,14 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include "example_utils.hpp"
// This example illustrates how to implement the SAXPY operation
// (Y[i] = a * X[i] + Y[i]) using rocThrust.

6
Libraries/rocThrust/vectors/main.hip

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@ -20,13 +20,13 @@ @@ -20,13 +20,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "example_utils.hpp"
int main()
{
// Allocate a resizable vector in host memory.

108
Scripts/WindowsRunner.ps1

@ -0,0 +1,108 @@ @@ -0,0 +1,108 @@
# Test runner: executes every file matching -Filter found recursively under
# -Path, each in its own background job with a time limit, then prints a
# result table and throws if any failure was counted.
#
# Parameters:
#   -Path    Directory handed to Get-ChildItem -Recurse as the search root.
#            NOTE(review): declared Mandatory *and* given the default "Debug";
#            a mandatory parameter has to be supplied by the caller, so the
#            default looks unreachable -- confirm which of the two is intended.
#   -Filter  File-name filter handed to Get-ChildItem -Filter (default "*.exe").
#   -Timeout Value handed to Wait-Job -Timeout for each example (default 10).
#   -Skip    Wildcard patterns compared against each file name with -like;
#            files matching any pattern are reported as skipped instead of run.
param(
[Parameter(Mandatory)]
[string]$Path = "Debug",
[string]$Filter = "*.exe",
[int]$Timeout = 10,
[string[]]$Skip = @()
)
# Bookkeeping used by the rest of the script: one result row per example and
# the number of examples that did not pass.
$Results = @()
$FailureCount = 0

# Strip leading/trailing whitespace from every skip pattern before matching.
$Skip = $Skip | ForEach-Object { $_.Trim() }

# Announce the configuration of this run, including each active skip pattern.
Write-Host "Testing all '$Filter' in '$Path' with a timeout of $Timeout"
Write-Host "Skipping examples that match any of:"
foreach ($Pattern in $Skip) {
    Write-Host "- $Pattern"
}
# Runs one example executable inside a background job, waits for it for at
# most the script-level $Timeout, and reports how it went.
#
# Parameters:
#   FileInfo  The executable to run. Its FullName is invoked inside the job and
#             its Name is used in the returned row.
#
# Output: a single PSCustomObject with
#   Name        file name of the example,
#   State       "Pass" or "Fail" wrapped in ANSI color escape sequences,
#   ExitStatus  the exit code, "CRASH!" (invocation threw) or "Timeout!"
#               (job produced no result in time), also color-wrapped,
#   Time        elapsed time formatted as mm:ss.fff, or $null on timeout.
#
# Side effects: echoes the output captured from the executable via Write-Host
# and increments the script-scope $FailureCount for every non-passing example.
function Run-Example {
    param(
        [System.IO.FileInfo]$FileInfo
    )

    $Job = Start-Job -ScriptBlock {
        param([string]$FullName)
        $Time = Measure-Command {
            try {
                $Log = & $FullName
                $JobExitStatus = $LASTEXITCODE
            } catch {
                $JobExitStatus = "CRASH!"
            }
        }
        return [PSCustomObject]@{
            ExitStatus = $JobExitStatus
            Log        = $Log
            Time       = $Time
        }
    } -ArgumentList $FileInfo.FullName

    # Execute the job with a timeout
    $Job | Wait-Job -TimeOut $Timeout | Out-Null

    # Get the results from the job! If the job is still running nothing is
    # received, so $Result (and therefore $Result.ExitStatus) is $null.
    $Result = Receive-Job $Job
    Write-Host $Result.Log

    if ($null -ne $Result.ExitStatus) {
        $TimeSpan = $Result.Time.toString("mm\:ss\.fff")
        $ExitStatus = $Result.ExitStatus
    } else {
        $ExitStatus = "Timeout!"
        $TimeSpan = $null
    }

    if ($Result.ExitStatus -eq 0) {
        # Exited gracefully!
        $Status = "`e[32mPass`e[0m"
        $ExitDisplay = "`e[32m$ExitStatus`e[0m"
    } else {
        $ExitDisplay = "`e[31m$ExitStatus`e[0m"
        # Otherwise, fail!
        $Status = "`e[31m`e[1mFail`e[0m"
        # The counter lives in script scope. A plain `$FailureCount += 1` here
        # would only create and bump a function-local copy, leaving the
        # script-level value at 0 so the final failure check could never fire.
        $script:FailureCount += 1
    }

    # Clean up! -Force also removes a job that is still running after a timeout.
    Remove-Job -force $Job

    [PSCustomObject]@{
        Name       = $FileInfo.Name
        State      = $Status
        ExitStatus = $ExitDisplay
        Time       = $TimeSpan
    }
}
# Visit every file under $Path that matches $Filter. Each one is either run
# through Run-Example or, when its name matches a skip wildcard, recorded as
# skipped. Either way exactly one row is appended to $Results.
Get-ChildItem -Recurse -File -Path $Path -Filter $Filter | ForEach-Object {
    Write-Host ("`e[36m-- {0}`e[0m" -f $_.Name)

    # First skip wildcard that matches this file name; stays $null if none does.
    $SkipMatch = $null
    foreach ($Wildcard in $Skip) {
        if ($_.Name -like $Wildcard) {
            $SkipMatch = $Wildcard
            break
        }
    }

    if ($null -ne $SkipMatch) {
        Write-Host "`e[33m`e[1mSkipped by wildcard:`e[0m $SkipMatch"
        # Skipped examples still get a row so they stand out in the table.
        $Results += [PSCustomObject]@{
            Name       = $_.Name
            State      = "`e[33m`e[1mSkip`e[0m"
            ExitStatus = $null
            Time       = $null
        }
    } else {
        $Results += Run-Example $_
    }
}

# Summarize all rows as a table.
$Results | Format-Table

# Abort with an error when the failure counter is positive.
if ($FailureCount -gt 0) {
    throw "$FailureCount failed jobs!"
}
Loading…
Cancel
Save