Browse Source

HIP "Basic" Example Suite (part 4) (#13)

* add cmake build instructions for windows

* Add bit extract example

* Resolve "Multi GPU data transfer example"

* Resolve "Vulkan interop example"

* fix bit extract typo

* Resolve "OpenGL interop example"

* Add module API example

* enable -Wall -Wextra -Werror in cmake in ci

* Resolve "CMake don't throw error when building examples and certain libraries are missing."

* Device Globals example

* add moving average example

* Resolve "Static library example"

* fix missing opengl cmake check

* Resolve "Inline assembly / GPU arch example"

* revert to old msvc project file structure

* add hip basic texture management example

* normalize line endings

* remove hiprtc from bitextract

* Resolve "Cooperative groups example"

* Fix GUIDs

* Resolve "Floyd-Warshall example"

Co-authored-by: Nol Moonen <nol@streamhpc.com>
Co-authored-by: Beatriz Navidad Vilches <beatriz@streamhpc.com>
Co-authored-by: Robin Voetter <robin@streamhpc.com>
Co-authored-by: Vince van Heertum <vince@streamhpc.com>
pull/16/head
Mátyás Aradi 3 years ago committed by GitHub
parent
commit
acdf61bdb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 3
      .gitattributes
  2. 49
      .gitlab-ci.yml
  3. 26
      Applications/CMakeLists.txt
  4. 34
      Applications/Makefile
  5. 43
      Applications/README.md
  6. 1
      Applications/floyd_warshall/.gitignore
  7. 58
      Applications/floyd_warshall/CMakeLists.txt
  8. 60
      Applications/floyd_warshall/Makefile
  9. 60
      Applications/floyd_warshall/README.md
  10. 25
      Applications/floyd_warshall/floyd_warshall_vs2019.sln
  11. 104
      Applications/floyd_warshall/floyd_warshall_vs2019.vcxproj
  12. 30
      Applications/floyd_warshall/floyd_warshall_vs2019.vcxproj.filters
  13. 281
      Applications/floyd_warshall/main.hip
  14. 3
      CMakeLists.txt
  15. 5
      Common/cmdparser.hpp
  16. 33
      Common/example_utils.hpp
  17. 311
      External/KHR/khrplatform.h
  18. 1947
      External/glad/glad.cpp
  19. 3649
      External/glad/glad.h
  20. 41
      HIP-Basic/CMakeLists.txt
  21. 19
      HIP-Basic/Makefile
  22. 45
      HIP-Basic/README.md
  23. 17
      HIP-Basic/assembly_to_executable/Makefile
  24. 50
      HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln
  25. 366
      HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj
  26. 106
      HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters
  27. 2
      HIP-Basic/assembly_to_executable/main.hip
  28. 78
      HIP-Basic/assembly_to_executable/main_gfx1030.s
  29. 61
      HIP-Basic/assembly_to_executable/main_gfx803.s
  30. 61
      HIP-Basic/assembly_to_executable/main_gfx900.s
  31. 61
      HIP-Basic/assembly_to_executable/main_gfx906.s
  32. 64
      HIP-Basic/assembly_to_executable/main_gfx908.s
  33. 64
      HIP-Basic/assembly_to_executable/main_gfx90a.s
  34. 26
      HIP-Basic/bandwidth/Makefile
  35. 50
      HIP-Basic/bandwidth/bandwidth_vs2019.sln
  36. 204
      HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj
  37. 58
      HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters
  38. 1
      HIP-Basic/bit_extract/.gitignore
  39. 57
      HIP-Basic/bit_extract/CMakeLists.txt
  40. 60
      HIP-Basic/bit_extract/Makefile
  41. 33
      HIP-Basic/bit_extract/README.md
  42. 25
      HIP-Basic/bit_extract/bit_extract_vs2019.sln
  43. 99
      HIP-Basic/bit_extract/bit_extract_vs2019.vcxproj
  44. 27
      HIP-Basic/bit_extract/bit_extract_vs2019.vcxproj.filters
  45. 111
      HIP-Basic/bit_extract/main.hip
  46. 1
      HIP-Basic/cooperative_groups/.gitignore
  47. 57
      HIP-Basic/cooperative_groups/CMakeLists.txt
  48. 60
      HIP-Basic/cooperative_groups/Makefile
  49. 41
      HIP-Basic/cooperative_groups/README.md
  50. 25
      HIP-Basic/cooperative_groups/cooperative_groups_vs2019.sln
  51. 99
      HIP-Basic/cooperative_groups/cooperative_groups_vs2019.vcxproj
  52. 27
      HIP-Basic/cooperative_groups/cooperative_groups_vs2019.vcxproj.filters
  53. 249
      HIP-Basic/cooperative_groups/main.hip
  54. 1
      HIP-Basic/device_globals/.gitignore
  55. 57
      HIP-Basic/device_globals/CMakeLists.txt
  56. 60
      HIP-Basic/device_globals/Makefile
  57. 45
      HIP-Basic/device_globals/README.md
  58. 25
      HIP-Basic/device_globals/device_globals_vs2019.sln
  59. 99
      HIP-Basic/device_globals/device_globals_vs2019.vcxproj
  60. 27
      HIP-Basic/device_globals/device_globals_vs2019.vcxproj.filters
  61. 164
      HIP-Basic/device_globals/main.hip
  62. 28
      HIP-Basic/device_query/Makefile
  63. 2
      HIP-Basic/device_query/device_query_vs2019.vcxproj
  64. 22
      HIP-Basic/dynamic_shared/Makefile
  65. 2
      HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj
  66. 22
      HIP-Basic/events/Makefile
  67. 4
      HIP-Basic/events/events_vs2019.vcxproj
  68. 1
      HIP-Basic/gpu_arch/.gitignore
  69. 57
      HIP-Basic/gpu_arch/CMakeLists.txt
  70. 60
      HIP-Basic/gpu_arch/Makefile
  71. 33
      HIP-Basic/gpu_arch/README.md
  72. 25
      HIP-Basic/gpu_arch/gpu_arch_vs2019.sln
  73. 99
      HIP-Basic/gpu_arch/gpu_arch_vs2019.vcxproj
  74. 27
      HIP-Basic/gpu_arch/gpu_arch_vs2019.vcxproj.filters
  75. 150
      HIP-Basic/gpu_arch/main.hip
  76. 22
      HIP-Basic/hello_world/Makefile
  77. 24
      HIP-Basic/hipify/Makefile
  78. 1
      HIP-Basic/inline_assembly/.gitignore
  79. 57
      HIP-Basic/inline_assembly/CMakeLists.txt
  80. 60
      HIP-Basic/inline_assembly/Makefile
  81. 48
      HIP-Basic/inline_assembly/README.md
  82. 25
      HIP-Basic/inline_assembly/inline_assembly_vs2019.sln
  83. 99
      HIP-Basic/inline_assembly/inline_assembly_vs2019.vcxproj
  84. 27
      HIP-Basic/inline_assembly/inline_assembly_vs2019.vcxproj.filters
  85. 145
      HIP-Basic/inline_assembly/main.hip
  86. 17
      HIP-Basic/llvm_ir_to_executable/Makefile
  87. 50
      HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln
  88. 366
      HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj
  89. 106
      HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters
  90. 2
      HIP-Basic/llvm_ir_to_executable/main.hip
  91. 16
      HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll
  92. 16
      HIP-Basic/llvm_ir_to_executable/main_gfx803.ll
  93. 16
      HIP-Basic/llvm_ir_to_executable/main_gfx900.ll
  94. 16
      HIP-Basic/llvm_ir_to_executable/main_gfx906.ll
  95. 16
      HIP-Basic/llvm_ir_to_executable/main_gfx908.ll
  96. 16
      HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll
  97. 26
      HIP-Basic/matrix_multiplication/Makefile
  98. 202
      HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj
  99. 2
      HIP-Basic/module_api/.gitignore
  100. 76
      HIP-Basic/module_api/CMakeLists.txt
  101. Some files were not shown because too many files have changed in this diff Show More

3
.gitattributes vendored

@ -1 +1,4 @@ @@ -1 +1,4 @@
*.hip gitlab-language=cuda linguist-language=Cuda
*.sln text eol=crlf
*.vcxproj text eol=crlf
*.vcxproj.filters text eol=crlf

49
.gitlab-ci.yml

@ -29,6 +29,12 @@ include: @@ -29,6 +29,12 @@ include:
- /gpus-nvcc.yaml
- /rules.yaml
variables:
# suppressing 186 allows us to write `assert(a && "message")`.
CUDA_FLAGS: "-Xcompiler -Wall,-Wextra,-Werror --Werror all-warnings --diag-suppress 186"
CXX_FLAGS: "-Wall -Wextra -Werror"
HIP_FLAGS: "-Wall -Wextra -Werror"
stages:
- lint
- build
@ -98,7 +104,7 @@ build:make-rocm: @@ -98,7 +104,7 @@ build:make-rocm:
- rocm-build
needs: []
script:
- cd $CI_PROJECT_DIR && make -j $(nproc)
- cd $CI_PROJECT_DIR && make CXXFLAGS="$HIP_FLAGS" -j $(nproc)
build:make-cuda:
image: $DOCKER_TAG_PREFIX:cuda-ubuntu
@ -109,7 +115,7 @@ build:make-cuda: @@ -109,7 +115,7 @@ build:make-cuda:
- nvcc-build
needs: []
script:
- cd $CI_PROJECT_DIR && make GPU_RUNTIME=CUDA -j $(nproc)
- cd $CI_PROJECT_DIR && make CXXFLAGS="$CUDA_FLAGS" GPU_RUNTIME=CUDA -j $(nproc)
.build:cmake:
stage: build
@ -132,6 +138,15 @@ build:cmake-rocm: @@ -132,6 +138,15 @@ build:cmake-rocm:
-S $CI_PROJECT_DIR
-B $CI_PROJECT_DIR/build
-D CMAKE_HIP_ARCHITECTURES="$GPU_TARGETS"
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_HIP_FLAGS="$HIP_FLAGS"
| tee cmake_log.txt
# check if all dependencies were found
- |-
if grep -q "Could NOT find" cmake_log.txt; then
echo "Some CMake libraries could not be found"
exit 1
fi
- cmake --build $CI_PROJECT_DIR/build
build:cmake-cuda:
@ -145,6 +160,15 @@ build:cmake-cuda: @@ -145,6 +160,15 @@ build:cmake-cuda:
-S $CI_PROJECT_DIR
-B $CI_PROJECT_DIR/build
-D GPU_RUNTIME=CUDA
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_CUDA_FLAGS="$CUDA_FLAGS"
| tee cmake_log.txt
# check if all dependencies were found
- |-
if grep -q "Could NOT find" cmake_log.txt; then
echo "Some CMake libraries could not be found"
exit 1
fi
- cmake --build $CI_PROJECT_DIR/build
.test:
@ -190,16 +214,19 @@ test:rocm-windows-vs2019: @@ -190,16 +214,19 @@ test:rocm-windows-vs2019:
- >
& "C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
/maxCpuCount
/warnAsError
"/p:Configuration=$BUILD_TYPE"
"$CI_PROJECT_DIR"
- |-
Get-ChildItem -Path "$CI_PROJECT_DIR/$BUILD_TYPE" -Filter "*_vs2019.exe" |
ForEach-Object {
echo "--" $_.Name
& "$_"
if (!$?) {
throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
}
if (("hip_vulkan_interop_vs2019.exe","hip_opengl_interop_vs2019.exe") -NotContains $_.Name) {
echo "--" $_.Name
& "$_"
if (!$?) {
throw "{0} returned: {1}" -f $_.Name, $LASTEXITCODE
}
}
}
test:rocm-windows-cmake:
@ -210,8 +237,16 @@ test:rocm-windows-cmake: @@ -210,8 +237,16 @@ test:rocm-windows-cmake:
-S "$CI_PROJECT_DIR"
-B "$CI_PROJECT_DIR/build"
-G Ninja
-D CMAKE_CXX_FLAGS="$CXX_FLAGS"
-D CMAKE_HIP_FLAGS="$HIP_FLAGS"
-D CMAKE_BUILD_TYPE="$BUILD_TYPE"
-D CMAKE_HIP_ARCHITECTURES=gfx1030
-D CMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/10/bin/10.0.19041.0/x64/rc.exe"
-D CMAKE_TOOLCHAIN_FILE="C:/Tools/Microsoft/vcpkg/scripts/buildsystems/vcpkg.cmake"
| Tee-Object -filepath cmake_log.txt
- |-
if (Select-String -Path cmake_log.txt -Pattern "Could NOT find") {
throw "Some cmake libraries are missing"
}
- cmake --build "$CI_PROJECT_DIR/build"
- cd "$CI_PROJECT_DIR/build" && ctest --output-on-failure

26
Applications/CMakeLists.txt

@ -0,0 +1,26 @@ @@ -0,0 +1,26 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(Applications LANGUAGES CXX)

# Aggregate project that builds every example of the Applications directory.
#
# The example added below registers itself with add_test(), which CMake only
# turns into a runnable test once testing has been enabled. Enable it here when
# this directory is configured on its own so that `ctest` works from this build
# tree; when included from a parent project, leave that decision to the parent.
if(PROJECT_IS_TOP_LEVEL)
    enable_testing()
endif()

add_subdirectory(floyd_warshall)

34
Applications/Makefile

@ -0,0 +1,34 @@ @@ -0,0 +1,34 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Subdirectories that each contain one example with its own Makefile.
EXAMPLES := \
floyd_warshall
# Default goal: build every example.
all: $(EXAMPLES)
# `make clean` reuses the per-example rule below: TARGET is set as a
# target-specific variable for `clean` (and therefore for its prerequisites),
# so depending on `all` invokes every example's Makefile with the `clean` goal.
clean: TARGET=clean
clean: all
# Recurse into the example directory named by the target, forwarding $(TARGET).
# TARGET is not set anywhere else in this file, so a plain `make` passes no goal.
$(EXAMPLES):
$(MAKE) -C $@ $(TARGET)
# None of these goals produce a file of the same name. The example names are
# declared phony as well, so the sub-make is always invoked.
.PHONY: all clean $(EXAMPLES)

43
Applications/README.md

@ -0,0 +1,43 @@ @@ -0,0 +1,43 @@
# Applications Examples
## Summary
The examples in this subdirectory showcase several GPU implementations of models and algorithms from finance, computer science, physics and other fields, which additionally offer a command line application. The examples are built on Linux for the ROCm (AMD GPU) backend. Some examples additionally support the CUDA (NVIDIA GPU) backend.
## Prerequisites
### Linux
- [CMake](https://cmake.org/download/) (at least version 3.21)
- OR GNU Make - available via the distribution's package manager
- [ROCm](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.1.3/page/Overview_of_ROCm_Installation_Methods.html) (at least version 5.x.x)
### Windows
- [Visual Studio](https://visualstudio.microsoft.com/) 2019 or 2022 with the "Desktop Development with C++" workload
- ROCm toolchain for Windows (No public release yet)
- The Visual Studio ROCm extension needs to be installed to build with the solution files.
- [CMake](https://cmake.org/download/) (optional, to build with CMake. Requires at least version 3.21)
- [Ninja](https://ninja-build.org/) (optional, to build with CMake)
## Building
### Linux
Make sure that the dependencies are installed, or use one of the [provided Dockerfiles](../Dockerfiles/) to build and run the examples in a containerized environment.
#### Using CMake
All examples in the `Applications` subdirectory can either be built by a single CMake project or be built independently.
- `$ cd Applications`
- `$ cmake -S . -B build` (on ROCm) or `$ cmake -S . -B build -D GPU_RUNTIME=CUDA` (on CUDA, when supported)
- `$ cmake --build build`
#### Using Make
All examples can be built by a single invocation to Make or be built independently.
- `$ cd Applications`
- `$ make` (on ROCm) or `$ make GPU_RUNTIME=CUDA` (on CUDA, when supported)
### Windows
#### Visual Studio
Visual Studio solution files are available for the individual examples. To build all supported HIP runtime examples open the top level solution file [ROCm-Examples-VS2019.sln](../ROCm-Examples-VS2019.sln) and filter for Applications.
For more detailed build instructions refer to the top level [README.md](../README.md#visual-studio).
#### CMake
All examples in the `Applications` subdirectory can either be built by a single CMake project or be built independently. For build instructions refer to the top-level [README.md](../README.md#cmake-2).

1
Applications/floyd_warshall/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
applications_floyd_warshall

58
Applications/floyd_warshall/CMakeLists.txt

@ -0,0 +1,58 @@ @@ -0,0 +1,58 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Builds the Floyd-Warshall example as a single executable compiled from
# main.hip, either as HIP (default) or as CUDA, and registers it as a CTest test.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

# Name shared by the project, the executable target and the test.
set(example_name applications_floyd_warshall)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME selects the language main.hip is compiled as.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
# Advertise the allowed values to cache editors.
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

# Stop configuration early when an unsupported runtime was requested.
if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
    message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

# Enable the selected GPU language and require C++17 without compiler extensions.
enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

# Use the ROCm installation as the package search prefix unless the user
# already supplied a CMAKE_PREFIX_PATH.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)

# Make example runnable using ctest.
# add_test() is ignored unless testing is enabled, so enable it when this
# example is configured on its own; a parent project decides for itself.
if(PROJECT_IS_TOP_LEVEL)
    enable_testing()
endif()
# The NAME/COMMAND signature lets CMake replace the target name with the
# location of the built executable, independent of generator or output directory.
add_test(NAME ${example_name} COMMAND ${example_name})

# Shared example headers (example_utils.hpp, cmdparser.hpp).
set(include_dirs "${CMAKE_CURRENT_SOURCE_DIR}/../../Common")
# For examples targeting NVIDIA, include the HIP header directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
    list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()
target_include_directories(${example_name} PRIVATE ${include_dirs})

# Compile main.hip with the selected GPU language regardless of its extension.
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

60
Applications/floyd_warshall/Makefile

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Builds the single executable $(EXAMPLE) from main.hip.
EXAMPLE := applications_floyd_warshall
# Directory holding the headers shared between examples.
COMMON_INCLUDE_DIR := ../../Common
# Backend to build for: HIP by default, CUDA when overridden on the command
# line (`make GPU_RUNTIME=CUDA`). Any other value is rejected below.
GPU_RUNTIME := HIP
# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
# Compiler driver; `?=` keeps a value already provided by the caller.
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
# The I-prefixed variables collect the flags this example needs itself; the
# user-facing CXXFLAGS/CPPFLAGS/LDFLAGS/LDLIBS are appended to them further down.
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
# Backend-specific flags.
ifeq ($(GPU_RUNTIME), CUDA)
# Treat main.hip as a CUDA source and add the HIP headers as a system include directory.
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
# Default warning flags, used only when the caller did not set CXXFLAGS.
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
# Append the caller-provided flags after the example's own flags.
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
# Compile and link in one step. Only the first prerequisite (main.hip, via $<)
# is passed to the compiler; the headers are listed so that editing them
# triggers a rebuild.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
# Remove the built executable.
clean:
$(RM) $(EXAMPLE)
.PHONY: clean

60
Applications/floyd_warshall/README.md

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# Applications Floyd-Warshall Example
## Description
This example showcases a GPU implementation of the [Floyd-Warshall algorithm](https://en.wikipedia.org/wiki/Floyd%E2%80%93Warshall_algorithm), which computes the shortest path between each pair of nodes in a given directed and (in this case) complete graph $G = (V, E, \omega)$. The key point of this implementation is that each kernel launch represents a step $k$ of the traditional CPU-implemented algorithm. Therefore, the kernel is launched as many times as the graph has nodes $\left(n = \vert V \vert \right)$.
In this example, there are `iterations` (consecutive) executions of the algorithm on the same graph. As each execution requires an unmodified graph input, multiple copy operations are required. Hence, the performance of the example can be improved by using _pinned memory_.
Pinned memory is simply a special kind of memory that cannot be paged out of the physical memory of a process, meaning that the virtual addresses associated with it are always mapped to physical memory. When copying data from/to the host to/from the GPU, the host source/destination must be pinned memory and, in case it is not, an extra allocation of pinned memory is first performed (copying the data residing in or being copied to the non-pinned host memory) and then the actual copy of the data takes place.
Therefore, using pinned memory saves around 2x the time needed to copy from/to host memory. In this example, performance is improved by using this type of memory, given that there are `iterations` (consecutive) executions of the algorithm on the same graph.
### Application flow
1. Default values for the number of nodes of the graph and the number of iterations for the algorithm execution are set.
2. Command line arguments are parsed (if any) and the previous values are updated.
3. A number of constants are defined for kernel execution and input/output data size.
4. Host memory is allocated for the distance matrix and initialized with the increasing sequence $1,2,3,\dots$ . These values represent the weights of the edges of the graph.
5. Host memory is allocated for the adjacency matrix and initialized such that the initial path between each pair of vertices $x,y \in V$ ($x \neq y$) is the edge $(x,y)$.
6. Pinned memory is allocated and mapped to device memory. The latter is initialized with the input matrices (distance and adjacency) representing the graph $G$ and the Floyd-Warshall kernel is executed for each node of the graph.
7. The resulting distance and adjacency matrices are copied to the host and pinned memory is freed.
8. The mean time in milliseconds needed for each iteration is printed to standard output.
9. The results obtained are compared with the CPU implementation of the algorithm. The result of the comparison is printed to the standard output.
### Command line interface
There are three parameters available:
- `-h` displays information about the available parameters and their default values.
- `-n nodes` sets `nodes` as the number of nodes of the graph to which the Floyd-Warshall algorithm will be applied. It must be a (positive) multiple of `block_size` (= 16). Its default value is 16.
- `-i iterations` sets `iterations` as the number of times that the algorithm will be applied to the (same) graph. It must be an integer greater than 0. Its default value is 1.
## Key APIs and Concepts
- For this GPU implementation of the Floyd-Warshall algorithm, the main kernel (`floyd_warshall_kernel`) is launched in a 2-dimensional grid. Each thread in the grid computes the shortest path between two nodes of the graph at a certain step $k$ $\left(0 \leq k < n \right)$. The threads compare the previously computed shortest paths using only the nodes in $V'=\{v_0,v_1,...,v_{k-1}\} \subseteq V$ as intermediate nodes with the paths that include node $v_k$ as an intermediate node, and take the shortest option. Therefore, the kernel is launched $n$ times.
- For improved performance, pinned memory is used to pass the results obtained in each iteration to the next one. With `hipHostMalloc` pinned host memory (accessible by the device) can be allocated, and `hipHostFree` frees it. In this example, host pinned memory is allocated using the `hipHostMallocMapped` flag, which indicates that `hipHostMalloc` must map the allocation into the address space of the current device. The device pointer to such allocated pinned memory is obtained with `hipHostGetDevicePointer`. Beware that an excessive allocation of pinned memory can slow down the host execution, as the program is left with less physical memory available to map the rest of the virtual addresses used.
- With `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others.
- `hipLaunchKernelGGL` queues the kernel execution on the device. All the kernels are launched on the `hipStreamDefault`, meaning that these executions are performed in order. `hipGetLastError` returns the last error produced by any runtime API call, allowing to check if any kernel launch resulted in error.
- `hipEventCreate` creates the events used to measure kernel execution time, `hipEventRecord` starts recording an event and `hipEventSynchronize` waits for all the previous work in the stream when the specified event was recorded. With these three functions the start and stop times of the kernel can be measured, and with `hipEventElapsedTime` the kernel execution time (in milliseconds) can be obtained.
## Demonstrated API Calls
### HIP runtime
#### Device symbols
- `blockIdx`
- `blockDim`
- `threadIdx`
#### Host symbols
- `__global__`
- `hipEventCreate`
- `hipEventElapsedTime`
- `hipEventRecord`
- `hipEventSynchronize`
- `hipGetLastError`
- `hipHostFree`
- `hipHostGetDevicePointer`
- `hipHostMalloc`
- `hipHostMallocMapped`
- `hipLaunchKernelGGL`
- `hipMemcpy`
- `hipMemcpyDeviceToHost`
- `hipMemcpyHostToDevice`
- `hipStreamDefault`

25
Applications/floyd_warshall/floyd_warshall_vs2019.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "floyd_warshall_vs2019", "floyd_warshall_vs2019.vcxproj", "{FB6B7014-2BC9-475C-B3CC-FEE6B4C5B103}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FB6B7014-2BC9-475C-B3CC-FEE6B4C5B103}.Debug|x64.ActiveCfg = Debug|x64
{FB6B7014-2BC9-475C-B3CC-FEE6B4C5B103}.Debug|x64.Build.0 = Debug|x64
{FB6B7014-2BC9-475C-B3CC-FEE6B4C5B103}.Release|x64.ActiveCfg = Release|x64
{FB6B7014-2BC9-475C-B3CC-FEE6B4C5B103}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0}
EndGlobalSection
EndGlobal

104
Applications/floyd_warshall/floyd_warshall_vs2019.vcxproj

@ -0,0 +1,104 @@ @@ -0,0 +1,104 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\cmdparser.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{FB6B7014-2BC9-475C-B3CC-FEE6B4C5B103}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>floyd_warshall_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>applications_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>applications_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

30
Applications/floyd_warshall/floyd_warshall_vs2019.vcxproj.filters

@ -0,0 +1,30 @@ @@ -0,0 +1,30 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{2932a426-602b-4926-887e-27c50ba7eab7}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{ed043ec4-e8ac-4831-93f5-a58546ec7bea}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{0da954bd-e555-4454-b082-b68d10c753b9}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\cmdparser.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

281
Applications/floyd_warshall/main.hip

@ -0,0 +1,281 @@ @@ -0,0 +1,281 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "cmdparser.hpp"
#include "example_utils.hpp"
#include <hip/hip_runtime.h>
#include <cassert>
#include <iostream>
#include <numeric>
#include <vector>
/// \brief Implements the k-th (0 <= k < nodes) step of the Floyd-Warshall algorithm. That is,
/// given a directed and weighted graph G = (V,E,w) (also complete in this example), it
/// computes the shortest path between every pair of vertices only considering as intermediate
/// nodes in the path the ones in the subset V' = {v_0,v_1,...,v_k} of V.
///
/// Each thread handles one entry of the row-major \p nodes x \p nodes matrices, selected by its
/// 2D global thread index.
///
/// \param part_adjacency_matrix - distances computed so far; updated in place.
/// \param part_next_matrix - path reconstruction matrix; the entry is set to \p k whenever the
/// path through v_k is shorter.
/// \param nodes - number of nodes of the graph (side length of both matrices).
/// \param k - index of the intermediate node considered in this step.
__global__ void floyd_warshall_kernel(unsigned int*      part_adjacency_matrix,
                                      unsigned int*      part_next_matrix,
                                      const unsigned int nodes,
                                      const unsigned int k)
{
    // Compute the pair of vertices whose shortest path this thread is going to process.
    // Unsigned arithmetic is used throughout so that indices and distances have the same type
    // as the matrices and as the CPU reference implementation.
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the matrix (grid larger than the matrix) have nothing to do.
    if(x >= nodes || y >= nodes)
    {
        return;
    }

    // Get the current distance between the two vertices (only with intermediate nodes in
    // {v_0,v_1,...,v_{k-1}}) and compute the distance using node v_k as intermediate. Note that
    // d_x_k_y is the shortest path between x and y with node v_k as intermediate, because
    // otherwise we could find a shorter path between y and v_k or/and v_k and x using
    // intermediate nodes from {v_0,v_1,...,v_{k-1}} and thus contradicting the fact that the
    // current paths between those two pairs of nodes are already the shortest possible.
    const unsigned int entry   = y * nodes + x;
    const unsigned int d_x_y   = part_adjacency_matrix[entry];
    const unsigned int d_x_k_y
        = part_adjacency_matrix[y * nodes + k] + part_adjacency_matrix[k * nodes + x];

    // If the path with intermediate nodes in {v_0, ..., v_{k-1}} is longer than the one
    // with intermediate node v_k, update matrices so the latter is selected as the
    // shortest path between x and y with intermediate nodes in {v_0, ..., v_k}.
    if(d_x_k_y < d_x_y)
    {
        part_adjacency_matrix[entry] = d_x_k_y;
        part_next_matrix[entry]      = k;
    }
}
/// \brief Host-side Floyd-Warshall used to validate the GPU results.
///
/// Both matrices are row-major with \p nodes rows and columns and are updated in place:
/// \p adjacency_matrix receives the shortest distances and \p next_matrix records, for every
/// improved entry, the intermediate node that produced the improvement.
void floyd_warshall_reference(unsigned int*      adjacency_matrix,
                              unsigned int*      next_matrix,
                              const unsigned int nodes)
{
    // 'via' plays the role of v_k: after finishing a value of 'via', every stored distance is
    // the shortest one using intermediate nodes from {v_0, ..., v_via} only.
    for(unsigned int via = 0; via < nodes; ++via)
    {
        const unsigned int via_offset = via * nodes;
        for(unsigned int src = 0; src < nodes; ++src)
        {
            const unsigned int src_offset = src * nodes;
            for(unsigned int dst = 0; dst < nodes; ++dst)
            {
                // Best known distance src -> dst without using node 'via'.
                unsigned int& direct = adjacency_matrix[src_offset + dst];

                // Distance of the detour src -> via -> dst.
                const unsigned int detour
                    = adjacency_matrix[src_offset + via] + adjacency_matrix[via_offset + dst];

                // Keep the detour only when it is strictly shorter, and remember which node
                // it goes through.
                if(detour < direct)
                {
                    direct                        = detour;
                    next_matrix[src_offset + dst] = via;
                }
            }
        }
    }
}
/// \brief Registers the command line options understood by this example on \p parser.
/// \tparam BlockSize - number of threads per block dimension; the default number of nodes is
/// checked against it at compile time.
template<unsigned int BlockSize>
void configure_parser(cli::Parser& parser)
{
    // Defaults used when the corresponding option is not given on the command line.
    constexpr unsigned int default_nodes      = 16;
    constexpr unsigned int default_iterations = 1;

    // Make sure the defaults themselves satisfy the constraints enforced on user input.
    static_assert(default_nodes % BlockSize == 0,
                  "Number of nodes must be a positive multiple of BlockSize");
    static_assert(default_iterations > 0, "Number of iterations must be at least 1");

    // -n / --nodes
    parser.set_optional<unsigned int>("n",
                                      "nodes",
                                      default_nodes,
                                      "Number of nodes in the graph.");
    // -i / --iterations
    parser.set_optional<unsigned int>("i",
                                      "iterations",
                                      default_iterations,
                                      "Number of times the algorithm is executed.");
}
int main(int argc, char* argv[])
{
// Number of threads in each kernel block dimension.
constexpr unsigned int block_size = 16;
// Parse user input.
cli::Parser parser(argc, argv);
configure_parser<block_size>(parser);
parser.run_and_exit_if_error();
// Get number of nodes and iterations from the command line, if provided.
const unsigned int nodes = parser.get<unsigned int>("n");
const unsigned int iterations = parser.get<unsigned int>("i");
// Check values provided.
if(nodes % block_size)
{
std::cout << "Number of nodes must be a positive multiple of block_size ("
<< std::to_string(block_size) << ")." << std::endl;
exit(0);
}
if(iterations == 0)
{
std::cout << "Number of iterations must be at least 1." << std::endl;
exit(0);
}
// Total number of elements and bytes of the input matrices.
const unsigned int size = nodes * nodes;
const unsigned int size_bytes = nodes * nodes * sizeof(unsigned int);
// Number of threads in each kernel block and number of blocks in the grid.
const dim3 block_dim(block_size, block_size);
const dim3 grid_dim(nodes / block_size, nodes / block_size);
// Allocate host input adjacency matrix initialized with the increasing sequence 1,2,3,... .
// Overwrite diagonal values (distance from a node to itself) to 0.
std::vector<unsigned int> adjacency_matrix(size);
std::iota(adjacency_matrix.begin(), adjacency_matrix.end(), 1);
for(unsigned int x = 0; x < nodes; x++)
{
adjacency_matrix[x * nodes + x] = 0;
}
// Allocate host input matrix for the reconstruction of the paths obtained and initialize such
// that the path from node x to node y is just the edge (x,y) for any pair of nodes x and y.
std::vector<unsigned int> next_matrix(size);
for(unsigned int x = 0; x < nodes; x++)
{
for(unsigned int y = 0; y < x; y++)
{
next_matrix[x * nodes + y] = x;
next_matrix[y * nodes + x] = y;
}
next_matrix[x * nodes + x] = x;
}
// Allocate host memory for the CPU implementation and copy input data.
std::vector<unsigned int> expected_adjacency_matrix(adjacency_matrix);
std::vector<unsigned int> expected_next_matrix(next_matrix);
// Declare host input (pinned) memory for incremental results from kernel executions.
unsigned int* part_adjacency_matrix = nullptr;
unsigned int* part_next_matrix = nullptr;
// Cumulative variable to compute the mean time per iteration of the algorithm.
double kernel_time = 0;
std::cout << "Executing Floyd-Warshall algorithm for " << iterations
<< " iterations with a complete graph of " << nodes << " nodes." << std::endl;
// Allocate pinned host memory mapped to device memory.
HIP_CHECK(hipHostMalloc(&part_adjacency_matrix, size_bytes, hipHostMallocMapped));
HIP_CHECK(hipHostMalloc(&part_next_matrix, size_bytes, hipHostMallocMapped));
// Get device pointer to pinned host memory allocations for the input matrices.
float *d_adjacency_matrix, *d_next_matrix;
HIP_CHECK(
hipHostGetDevicePointer((void**)&d_adjacency_matrix, part_adjacency_matrix, 0 /*flags*/));
HIP_CHECK(hipHostGetDevicePointer((void**)&d_next_matrix, part_next_matrix, 0 /*flags*/));
// Run iterations times the Floyd-Warshall GPU algorithm.
for(unsigned int i = 0; i < iterations; ++i)
{
// Copy input data from host to device memory.
HIP_CHECK(hipMemcpy(d_adjacency_matrix,
adjacency_matrix.data(),
size_bytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(d_next_matrix, next_matrix.data(), size_bytes, hipMemcpyHostToDevice));
// Create events to measure the execution time of the kernels.
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));
float kernel_ms{};
// Floyd-Warshall GPU algorithm: launch Floyd-Warshall kernel for each node of the graph.
for(unsigned int k = 0; k < nodes; ++k)
{
// Record the start event.
HIP_CHECK(hipEventRecord(start, hipStreamDefault));
// Launch Floyd-Warshall kernel on the default stream.
hipLaunchKernelGGL(floyd_warshall_kernel,
grid_dim,
block_dim,
0,
hipStreamDefault,
part_adjacency_matrix,
part_next_matrix,
nodes,
k);
// Check if the kernel launch was successful.
HIP_CHECK(hipGetLastError());
// Record the stop event and wait until the kernel execution finishes.
HIP_CHECK(hipEventRecord(stop, hipStreamDefault));
HIP_CHECK(hipEventSynchronize(stop));
// Get the execution time of the kernel and add it to the total count.
HIP_CHECK(hipEventElapsedTime(&kernel_ms, start, stop));
kernel_time += kernel_ms;
}
}
// Copy results back to host.
HIP_CHECK(
hipMemcpy(adjacency_matrix.data(), d_adjacency_matrix, size_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(next_matrix.data(), d_next_matrix, size_bytes, hipMemcpyDeviceToHost));
// Free device memory.
HIP_CHECK(hipHostFree(part_adjacency_matrix));
HIP_CHECK(hipHostFree(part_next_matrix));
// Print the mean time per iteration (in miliseconds) of the algorithm.
kernel_time /= iterations;
std::cout << "The mean time needed for each iteration has been " << kernel_time << "ms."
<< std::endl;
// Execute CPU algorithm.
floyd_warshall_reference(expected_adjacency_matrix.data(), expected_next_matrix.data(), nodes);
// Verify results.
unsigned int errors = 0;
std::cout << "Validating results with CPU implementation." << std::endl;
for(unsigned int i = 0; i < size; ++i)
{
errors += (adjacency_matrix[i] - expected_adjacency_matrix[i] != 0);
errors += (next_matrix[i] - expected_next_matrix[i] != 0);
}
if(errors)
{
std::cout << "Validation failed with " << errors << " errors." << std::endl;
return error_exit_code;
}
else
{
std::cout << "Validation passed." << std::endl;
}
}

3
CMakeLists.txt

@ -21,8 +21,9 @@ @@ -21,8 +21,9 @@
# SOFTWARE.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(ROCMm-SDK-Examples)
project(ROCMm-SDK-Examples LANGUAGES CXX)
enable_testing()
add_subdirectory(Applications)
add_subdirectory(HIP-Basic)
add_subdirectory(Libraries)

5
Common/cmdparser.hpp

@ -433,11 +433,8 @@ public: @@ -433,11 +433,8 @@ public:
[this](CallbackArgs& args)
{
args.output << this->usage();
#pragma warning(push)
#pragma warning(disable : 4702)
exit(0);
return false;
#pragma warning(pop)
}),
"",
true);
@ -765,4 +762,4 @@ private: @@ -765,4 +762,4 @@ private:
std::vector<std::string> _arguments;
std::vector<CmdBase*> _commands;
};
} // namespace cli
} // namespace cli

33
Common/example_utils.hpp

@ -28,6 +28,7 @@ @@ -28,6 +28,7 @@
#include <iterator>
#include <sstream>
#include <string>
#include <type_traits>
#include <hip/hip_runtime.h>
@ -52,7 +53,7 @@ constexpr int error_exit_code = -1; @@ -52,7 +53,7 @@ constexpr int error_exit_code = -1;
/// must be dereferencable in host code. Its value type must be formattable to
/// \p std::ostream.
template<class BidirectionalIterator>
std::string format_range(const BidirectionalIterator begin, const BidirectionalIterator end)
inline std::string format_range(const BidirectionalIterator begin, const BidirectionalIterator end)
{
std::stringstream sstream;
sstream << "[ ";
@ -74,10 +75,10 @@ std::string format_range(const BidirectionalIterator begin, const BidirectionalI @@ -74,10 +75,10 @@ std::string format_range(const BidirectionalIterator begin, const BidirectionalI
/// \tparam BidirectionalIteratorU - must implement the BidirectionalIterator concept and
/// must be dereferencable in host code. Its value type must be formattable to \p std::ostream.
template<class BidirectionalIteratorT, typename BidirectionalIteratorU>
std::string format_pairs(const BidirectionalIteratorT begin_a,
const BidirectionalIteratorT end_a,
const BidirectionalIteratorU begin_b,
const BidirectionalIteratorU end_b)
inline std::string format_pairs(const BidirectionalIteratorT begin_a,
const BidirectionalIteratorT end_a,
const BidirectionalIteratorU begin_b,
const BidirectionalIteratorU end_b)
{
(void)end_b;
assert(std::distance(begin_a, end_a) == std::distance(begin_b, end_b));
@ -101,7 +102,7 @@ std::string format_pairs(const BidirectionalIteratorT begin_a, @@ -101,7 +102,7 @@ std::string format_pairs(const BidirectionalIteratorT begin_a,
/// \brief A function to parse a string for an int. If the string is a valid integer then return true
/// else if it has non-numeric character then return false.
bool parse_int_string(const std::string& str, int& out)
inline bool parse_int_string(const std::string& str, int& out)
{
try
{
@ -133,16 +134,17 @@ public: @@ -133,16 +134,17 @@ public:
this->reset_timer();
}
void reset_timer()
inline void reset_timer()
{
this->elapsed_time = std::chrono::steady_clock::duration(0);
}
void start_timer()
inline void start_timer()
{
this->start_time = std::chrono::steady_clock::now();
}
void stop_timer()
inline void stop_timer()
{
const auto end_time = std::chrono::steady_clock::now();
this->elapsed_time += end_time - this->start_time;
@ -150,10 +152,21 @@ public: @@ -150,10 +152,21 @@ public:
/// @brief Returns time elapsed in Seconds
/// @return type double that contains the elapsed time in Seconds
double get_elapsed_time() const
inline double get_elapsed_time() const
{
return std::chrono::duration_cast<std::chrono::duration<double>>(this->elapsed_time)
.count();
}
};
/// \brief Returns <tt>ceil(dividend / divisor)</tt>, where \p dividend is an integer and
/// \p divisor is an unsigned integer.
///
/// The template only participates in overload resolution when \p T is an integral type and
/// \p U is an unsigned type (enforced through \p std::enable_if_t). The return type is deduced
/// from the expression, so it follows the usual arithmetic conversions between \p T and \p U.
///
/// \note No check is made for <tt>divisor == 0</tt> nor for overflow of
/// <tt>dividend + divisor - 1</tt>.
/// \note NOTE(review): the round-up identity presumably targets non-negative dividends;
/// confirm against callers before using it with negative values.
template<typename T,
typename U,
std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<U>::value, int> = 0>
__host__ __device__ auto ceiling_div(const T& dividend, const U& divisor)
{
// Adding (divisor - 1) before the truncating division rounds the quotient up.
return (dividend + divisor - 1) / divisor;
}
#endif // COMMON_EXAMPLE_UTILS_HPP

311
External/KHR/khrplatform.h vendored

@ -0,0 +1,311 @@ @@ -0,0 +1,311 @@
#ifndef __khrplatform_h_
#define __khrplatform_h_
/*
** Copyright (c) 2008-2018 The Khronos Group Inc.
**
** Permission is hereby granted, free of charge, to any person obtaining a
** copy of this software and/or associated documentation files (the
** "Materials"), to deal in the Materials without restriction, including
** without limitation the rights to use, copy, modify, merge, publish,
** distribute, sublicense, and/or sell copies of the Materials, and to
** permit persons to whom the Materials are furnished to do so, subject to
** the following conditions:
**
** The above copyright notice and this permission notice shall be included
** in all copies or substantial portions of the Materials.
**
** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
*/
/* Khronos platform-specific types and definitions.
*
* The master copy of khrplatform.h is maintained in the Khronos EGL
* Registry repository at https://github.com/KhronosGroup/EGL-Registry
* The last semantic modification to khrplatform.h was at commit ID:
* 67a3e0864c2d75ea5287b9f3d2eb74a745936692
*
* Adopters may modify this file to suit their platform. Adopters are
* encouraged to submit platform specific modifications to the Khronos
* group so that they can be included in future versions of this file.
* Please submit changes by filing pull requests or issues on
* the EGL Registry repository linked above.
*
*
* See the Implementer's Guidelines for information about where this file
* should be located on your system and for more details of its use:
* http://www.khronos.org/registry/implementers_guide.pdf
*
* This file should be included as
* #include <KHR/khrplatform.h>
* by Khronos client API header files that use its types and defines.
*
* The types in khrplatform.h should only be used to define API-specific types.
*
* Types defined in khrplatform.h:
* khronos_int8_t signed 8 bit
* khronos_uint8_t unsigned 8 bit
* khronos_int16_t signed 16 bit
* khronos_uint16_t unsigned 16 bit
* khronos_int32_t signed 32 bit
* khronos_uint32_t unsigned 32 bit
* khronos_int64_t signed 64 bit
* khronos_uint64_t unsigned 64 bit
* khronos_intptr_t signed same number of bits as a pointer
* khronos_uintptr_t unsigned same number of bits as a pointer
* khronos_ssize_t signed size
* khronos_usize_t unsigned size
* khronos_float_t signed 32 bit floating point
* khronos_time_ns_t unsigned 64 bit time in nanoseconds
* khronos_utime_nanoseconds_t unsigned time interval or absolute time in
* nanoseconds
* khronos_stime_nanoseconds_t signed time interval in nanoseconds
* khronos_boolean_enum_t enumerated boolean type. This should
* only be used as a base type when a client API's boolean type is
* an enum. Client APIs which use an integer or other type for
* booleans cannot use this as the base type for their boolean.
*
* Tokens defined in khrplatform.h:
*
* KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values.
*
* KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0.
* KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0.
*
* Calling convention macros defined in this file:
* KHRONOS_APICALL
* KHRONOS_APIENTRY
* KHRONOS_APIATTRIBUTES
*
* These may be used in function prototypes as:
*
* KHRONOS_APICALL void KHRONOS_APIENTRY funcname(
* int arg1,
* int arg2) KHRONOS_APIATTRIBUTES;
*/
#if defined(__SCITECH_SNAP__) && !defined(KHRONOS_STATIC)
# define KHRONOS_STATIC 1
#endif
/*-------------------------------------------------------------------------
* Definition of KHRONOS_APICALL
*-------------------------------------------------------------------------
* This precedes the return type of the function in the function prototype.
*/
#if defined(KHRONOS_STATIC)
/* If the preprocessor constant KHRONOS_STATIC is defined, make the
* header compatible with static linking. */
# define KHRONOS_APICALL
#elif defined(_WIN32)
# define KHRONOS_APICALL __declspec(dllimport)
#elif defined (__SYMBIAN32__)
# define KHRONOS_APICALL IMPORT_C
#elif defined(__ANDROID__)
# define KHRONOS_APICALL __attribute__((visibility("default")))
#else
# define KHRONOS_APICALL
#endif
/*-------------------------------------------------------------------------
* Definition of KHRONOS_APIENTRY
*-------------------------------------------------------------------------
* This follows the return type of the function and precedes the function
* name in the function prototype.
*/
#if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__)
/* Win32 but not WinCE */
# define KHRONOS_APIENTRY __stdcall
#else
# define KHRONOS_APIENTRY
#endif
/*-------------------------------------------------------------------------
* Definition of KHRONOS_APIATTRIBUTES
*-------------------------------------------------------------------------
* This follows the closing parenthesis of the function prototype arguments.
*/
#if defined (__ARMCC_2__)
#define KHRONOS_APIATTRIBUTES __softfp
#else
#define KHRONOS_APIATTRIBUTES
#endif
/*-------------------------------------------------------------------------
* basic type definitions
*-----------------------------------------------------------------------*/
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__)
/*
* Using <stdint.h>
*/
#include <stdint.h>
typedef int32_t khronos_int32_t;
typedef uint32_t khronos_uint32_t;
typedef int64_t khronos_int64_t;
typedef uint64_t khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
/*
* To support platform where unsigned long cannot be used interchangeably with
* inptr_t (e.g. CHERI-extended ISAs), we can use the stdint.h intptr_t.
* Ideally, we could just use (u)intptr_t everywhere, but this could result in
* ABI breakage if khronos_uintptr_t is changed from unsigned long to
* unsigned long long or similar (this results in different C++ name mangling).
* To avoid changes for existing platforms, we restrict usage of intptr_t to
* platforms where the size of a pointer is larger than the size of long.
*/
#if defined(__SIZEOF_LONG__) && defined(__SIZEOF_POINTER__)
#if __SIZEOF_POINTER__ > __SIZEOF_LONG__
#define KHRONOS_USE_INTPTR_T
#endif
#endif
#elif defined(__VMS ) || defined(__sgi)
/*
* Using <inttypes.h>
*/
#include <inttypes.h>
typedef int32_t khronos_int32_t;
typedef uint32_t khronos_uint32_t;
typedef int64_t khronos_int64_t;
typedef uint64_t khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif defined(_WIN32) && !defined(__SCITECH_SNAP__)
/*
* Win32
*/
typedef __int32 khronos_int32_t;
typedef unsigned __int32 khronos_uint32_t;
typedef __int64 khronos_int64_t;
typedef unsigned __int64 khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif defined(__sun__) || defined(__digital__)
/*
* Sun or Digital
*/
typedef int khronos_int32_t;
typedef unsigned int khronos_uint32_t;
#if defined(__arch64__) || defined(_LP64)
typedef long int khronos_int64_t;
typedef unsigned long int khronos_uint64_t;
#else
typedef long long int khronos_int64_t;
typedef unsigned long long int khronos_uint64_t;
#endif /* __arch64__ */
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif 0
/*
* Hypothetical platform with no float or int64 support
*/
typedef int khronos_int32_t;
typedef unsigned int khronos_uint32_t;
#define KHRONOS_SUPPORT_INT64 0
#define KHRONOS_SUPPORT_FLOAT 0
#else
/*
* Generic fallback
*/
#include <stdint.h>
typedef int32_t khronos_int32_t;
typedef uint32_t khronos_uint32_t;
typedef int64_t khronos_int64_t;
typedef uint64_t khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#endif
/*
* Types that are (so far) the same on all platforms
*/
typedef signed char khronos_int8_t;
typedef unsigned char khronos_uint8_t;
typedef signed short int khronos_int16_t;
typedef unsigned short int khronos_uint16_t;
/*
* Types that differ between LLP64 and LP64 architectures - in LLP64,
* pointers are 64 bits, but 'long' is still 32 bits. Win64 appears
* to be the only LLP64 architecture in current use.
*/
#ifdef KHRONOS_USE_INTPTR_T
typedef intptr_t khronos_intptr_t;
typedef uintptr_t khronos_uintptr_t;
#elif defined(_WIN64)
typedef signed long long int khronos_intptr_t;
typedef unsigned long long int khronos_uintptr_t;
#else
typedef signed long int khronos_intptr_t;
typedef unsigned long int khronos_uintptr_t;
#endif
#if defined(_WIN64)
typedef signed long long int khronos_ssize_t;
typedef unsigned long long int khronos_usize_t;
#else
typedef signed long int khronos_ssize_t;
typedef unsigned long int khronos_usize_t;
#endif
#if KHRONOS_SUPPORT_FLOAT
/*
* Float type
*/
typedef float khronos_float_t;
#endif
#if KHRONOS_SUPPORT_INT64
/* Time types
*
* These types can be used to represent a time interval in nanoseconds or
* an absolute Unadjusted System Time. Unadjusted System Time is the number
* of nanoseconds since some arbitrary system event (e.g. since the last
* time the system booted). The Unadjusted System Time is an unsigned
* 64 bit value that wraps back to 0 every 584 years. Time intervals
* may be either signed or unsigned.
*/
typedef khronos_uint64_t khronos_utime_nanoseconds_t;
typedef khronos_int64_t khronos_stime_nanoseconds_t;
#endif
/*
* Dummy value used to pad enum types to 32 bits.
*/
#ifndef KHRONOS_MAX_ENUM
#define KHRONOS_MAX_ENUM 0x7FFFFFFF
#endif
/*
* Enumerated boolean type
*
* Values other than zero should be considered to be true. Therefore
* comparisons should not be made against KHRONOS_TRUE.
*/
typedef enum {
KHRONOS_FALSE = 0,
KHRONOS_TRUE = 1,
KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM
} khronos_boolean_enum_t;
#endif /* __khrplatform_h_ */

1947
External/glad/glad.cpp vendored

File diff suppressed because it is too large Load Diff

3649
External/glad/glad.h vendored

File diff suppressed because it is too large Load Diff

41
HIP-Basic/CMakeLists.txt

@ -21,26 +21,63 @@ @@ -21,26 +21,63 @@
# SOFTWARE.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(HIP-Basic)
project(HIP-Basic LANGUAGES CXX)
# Only supported on HIP (not CUDA)
if(NOT "${GPU_RUNTIME}" STREQUAL "CUDA")
add_subdirectory(assembly_to_executable)
add_subdirectory(llvm_ir_to_executable)
add_subdirectory(module_api)
endif()
add_subdirectory(bandwidth)
add_subdirectory(bit_extract)
add_subdirectory(cooperative_groups)
add_subdirectory(device_query)
add_subdirectory(device_globals)
add_subdirectory(dynamic_shared)
add_subdirectory(events)
add_subdirectory(gpu_arch)
if(NOT WIN32)
add_subdirectory(hello_world)
add_subdirectory(hipify)
find_package(Perl)
if(Perl_FOUND)
add_subdirectory(hipify)
else()
message("Perl not found, not building hipify example")
endif()
endif()
add_subdirectory(inline_assembly)
add_subdirectory(matrix_multiplication)
add_subdirectory(moving_average)
add_subdirectory(multi_gpu_data_transfer)
add_subdirectory(occupancy)
add_subdirectory(runtime_compilation)
add_subdirectory(saxpy)
add_subdirectory(shared_memory)
add_subdirectory(static_host_library)
add_subdirectory(streams)
# temporarily exclude texture management on Windows
if(NOT WIN32)
add_subdirectory(texture_management)
endif()
add_subdirectory(warp_shuffle)
# Optional dependencies of the graphics interop examples. Neither is REQUIRED: when one is
# missing, the affected example is skipped with a message instead of failing the configure.
find_package(glfw3)
find_package(Vulkan COMPONENTS glslangValidator)

# The OpenGL interop example only needs GLFW.
if(glfw3_FOUND)
  add_subdirectory(opengl_interop)
else()
  message("GLFW not found, not building OpenGL interop example")
endif()

# The Vulkan interop example needs both GLFW and Vulkan; when skipping, GLFW is reported
# first if it is the missing package.
if(glfw3_FOUND AND Vulkan_FOUND)
  add_subdirectory(vulkan_interop)
elseif(NOT glfw3_FOUND)
  message("GLFW not found, not building Vulkan interop example")
else()
  message("Vulkan not found, not building Vulkan interop example")
endif()

19
HIP-Basic/Makefile

@ -22,24 +22,37 @@ @@ -22,24 +22,37 @@
EXAMPLES := \
bandwidth \
bit_extract \
cooperative_groups \
device_query \
device_globals \
dynamic_shared \
events \
gpu_arch \
hello_world \
hipify \
inline_assembly \
matrix_multiplication \
moving_average \
multi_gpu_data_transfer \
occupancy \
opengl_interop \
runtime_compilation \
saxpy \
shared_memory \
streams \
static_host_library \
texture_management \
vulkan_interop \
warp_shuffle
# Only supported on HIP (not CUDA).
ifneq ($(GPU_RUNTIME), CUDA)
EXAMPLES += \
assembly_to_executable \
llvm_ir_to_executable
EXAMPLES += \
assembly_to_executable \
llvm_ir_to_executable \
module_api \
static_device_library
endif
all: $(EXAMPLES)

45
HIP-Basic/README.md

@ -0,0 +1,45 @@ @@ -0,0 +1,45 @@
# HIP-Basic Examples
## Summary
The examples in this subdirectory showcase the functionality of the HIP runtime. The examples build on Linux for the ROCm (AMD GPU) backend. Some examples additionally support Windows, some examples additionally support the CUDA (NVIDIA GPU) backend.
## Prerequisites
### Linux
- [CMake](https://cmake.org/download/) (at least version 3.21)
- OR GNU Make - available via the distribution's package manager
- [ROCm](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.1.3/page/Overview_of_ROCm_Installation_Methods.html) (at least version 5.x.x)
### Windows
- [Visual Studio](https://visualstudio.microsoft.com/) 2019 or 2022 with the "Desktop Development with C++" workload
- ROCm toolchain for Windows (No public release yet)
- The Visual Studio ROCm extension needs to be installed to build with the solution files.
- [CMake](https://cmake.org/download/) (optional, to build with CMake. Requires at least version 3.21)
- [Ninja](https://ninja-build.org/) (optional, to build with CMake)
## Building
### Linux
Make sure that the dependencies are installed, or use one of the [provided Dockerfiles](../../Dockerfiles/) to build and run the examples in a containerized environment.
#### Using CMake
All examples in the `HIP-Basic` subdirectory can either be built by a single CMake project or be built independently.
- `$ cd HIP-Basic`
- `$ cmake -S . -B build` (on ROCm) or `$ cmake -S . -B build -D GPU_RUNTIME=CUDA` (on CUDA, when supported)
- `$ cmake --build build`
#### Using Make
All examples can be built by a single invocation to Make or be built independently.
- `$ cd HIP-Basic`
- `$ make` (on ROCm) or `$ make GPU_RUNTIME=CUDA` (on CUDA, when supported)
### Windows
Not all HIP runtime examples support building on Windows. See the README file in the directory of the example for more details.
#### Visual Studio
Visual Studio solution files are available for the individual examples. To build all supported HIP runtime examples open the top level solution file [ROCm-Examples-VS2019.sln](../../ROCm-Examples-VS2019.sln) and filter for HIP-Basic.
For more detailed build instructions refer to the top level [README.md](../../README.md#visual-studio).
#### CMake
All examples in the `HIP-Basic` subdirectory can either be built by a single CMake project or be built independently. For build instructions refer to the top-level [README.md](../../README.md#cmake-2).

17
HIP-Basic/assembly_to_executable/Makefile

@ -23,10 +23,9 @@ COMMON_INCLUDE_DIR := ../../Common @@ -23,10 +23,9 @@ COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME ?= HIP
ifneq ($(GPU_RUNTIME), HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
endif
# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
@ -37,11 +36,11 @@ LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc @@ -37,11 +36,11 @@ LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc
CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD) $(CXXFLAGS)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) $(CPPFLAGS)
ILDFLAGS := $(LDFLAGS)
ILDLIBS := $(LDLIBS)
# Compile for these GPU architectures
HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030
@ -60,7 +59,7 @@ GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amd @@ -60,7 +59,7 @@ GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amd
all: $(EXAMPLE)
$(EXAMPLE): main.o main_device.o
$(HIPCXX) -o $@ $^
$(HIPCXX) $(ILDFLAGS) -o $@ $^ $(ILDLIBS)
main_device.o: hip_obj_gen.mcin offload_bundle.hipfb
$(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj
@ -73,7 +72,7 @@ offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o) @@ -73,7 +72,7 @@ offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o)
-output=$@
main.o: main.hip
$(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only $<
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) -c --cuda-host-only $<
main_%.o: main_%.s
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $<

50
HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln

@ -1,25 +1,25 @@ @@ -1,25 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "assembly_to_executable_vs2019", "assembly_to_executable_vs2019.vcxproj", "{60B4ADE0-8286-46AE-B884-5DA51B541DED}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.ActiveCfg = Debug|x64
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.Build.0 = Debug|x64
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.ActiveCfg = Release|x64
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {5EAD9B5F-41B6-452E-922F-D5782C75EB8F}
EndGlobalSection
EndGlobal

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "assembly_to_executable_vs2019", "assembly_to_executable_vs2019.vcxproj", "{60B4ADE0-8286-46AE-B884-5DA51B541DED}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.ActiveCfg = Debug|x64
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.Build.0 = Debug|x64
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.ActiveCfg = Release|x64
{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {5EAD9B5F-41B6-452E-922F-D5782C75EB8F}
EndGlobalSection
EndGlobal

366
HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj

@ -1,183 +1,183 @@ @@ -1,183 +1,183 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<!-- main.hip is compiled with the host-only option in both configurations.
     The device code is not produced from this file: the main_gfx*.s CustomBuild
     items in this project assemble it separately for each GPU architecture. -->
<ClCompile Include="main.hip">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="hip_obj_gen_win.mcin">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs>
</CustomBuild>
<CustomBuild Include="main_gfx1030.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
</CustomBuild>
<CustomBuild Include="main_gfx803.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
</CustomBuild>
<!-- Assembles the gfx900 device code into $(IntDir)main_gfx900.o.
     The tool path is quoted as a whole, exactly like the sibling main_gfx*.s items. -->
<CustomBuild Include="main_gfx900.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
</CustomBuild>
<CustomBuild Include="main_gfx906.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
</CustomBuild>
<CustomBuild Include="main_gfx908.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
</CustomBuild>
<CustomBuild Include="main_gfx90a.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
</CustomBuild>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{60b4ade0-8286-46ae-b884-5da51b541ded}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>assembly_to_executable_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<!-- Debug|x64 defaults for host compilation, linking and the device-code custom build. -->
<ClCompile>
<!-- Same warning level as the Release configuration. -->
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<!-- main_device.obj is produced by the custom build step below. -->
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<!-- Default command for CustomBuild items; the main_gfx*.s items override it to add their -mcpu value. -->
<CustomBuild>
<Message>Compiling Device Assembly %(Identity)</Message>
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
<Outputs>$(IntDir)%(FileName).o</Outputs>
</CustomBuild>
<!-- Step 1 bundles the per-architecture device objects into offload_bundle.hipfb.
     Step 2 runs llvm-mc on hip_obj_gen_win.mcin inside $(IntDir) to emit main_device.obj. -->
<CustomBuildStep>
<Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
</CustomBuildStep>
<CustomBuildStep>
<Message>Generating Device Offload Object</Message>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>$(IntDir)main_device.obj</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<!-- The last entry must match the file name written by the hip_obj_gen_win.mcin copy item,
     otherwise changes to that file do not re-trigger this step. -->
<Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<!-- Release|x64 defaults for host compilation, linking and the device-code custom build. -->
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
<!-- main_device.obj is produced by the custom build step below. -->
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<!-- Default command for CustomBuild items; the main_gfx*.s items override it to add their -mcpu value. -->
<CustomBuild>
<Message>Compiling Device Assembly %(Identity)</Message>
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
<Outputs>$(IntDir)%(FileName).o</Outputs>
</CustomBuildStep_placeholder_removed>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<!-- main.hip is compiled with the host-only option in both configurations.
     The device code is not produced from this file: the main_gfx*.s CustomBuild
     items in this project assemble it separately for each GPU architecture. -->
<ClCompile Include="main.hip">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="hip_obj_gen_win.mcin">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs>
</CustomBuild>
<CustomBuild Include="main_gfx1030.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
</CustomBuild>
<CustomBuild Include="main_gfx803.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
</CustomBuild>
<!-- Assembles the gfx900 device code into $(IntDir)main_gfx900.o.
     The tool path is quoted as a whole, exactly like the sibling main_gfx*.s items. -->
<CustomBuild Include="main_gfx900.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
</CustomBuild>
<CustomBuild Include="main_gfx906.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
</CustomBuild>
<CustomBuild Include="main_gfx908.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
</CustomBuild>
<CustomBuild Include="main_gfx90a.s">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
</CustomBuild>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{60b4ade0-8286-46ae-b884-5da51b541ded}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>assembly_to_executable_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<!-- Debug|x64 defaults for host compilation, linking and the device-code custom build. -->
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<!-- main_device.obj is produced by the custom build step below. -->
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<!-- Default command for CustomBuild items; the main_gfx*.s items override it to add their -mcpu value. -->
<CustomBuild>
<Message>Compiling Device Assembly %(Identity)</Message>
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
<Outputs>$(IntDir)%(FileName).o</Outputs>
</CustomBuild>
<!-- Step 1 bundles the per-architecture device objects into offload_bundle.hipfb.
     Step 2 runs llvm-mc on hip_obj_gen_win.mcin inside $(IntDir) to emit main_device.obj. -->
<CustomBuildStep>
<Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
</CustomBuildStep>
<CustomBuildStep>
<Message>Generating Device Offload Object</Message>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>$(IntDir)main_device.obj</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<!-- The last entry must match the file name written by the hip_obj_gen_win.mcin copy item,
     otherwise changes to that file do not re-trigger this step. -->
<Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<!-- Release|x64 defaults for host compilation, linking and the device-code custom build. -->
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
<!-- main_device.obj is produced by the custom build step below. -->
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<!-- Default command for CustomBuild items; the main_gfx*.s items override it to add their -mcpu value. -->
<CustomBuild>
<Message>Compiling Device Assembly %(Identity)</Message>
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
<Outputs>$(IntDir)%(FileName).o</Outputs>
</CustomBuild>
<!-- Step 1 bundles the per-architecture device objects into offload_bundle.hipfb.
     Every device target uses the same form as the Debug configuration: the triple
     amdgcn-amd-amdhsa, two dashes, then the architecture name.
     Step 2 runs llvm-mc on hip_obj_gen_win.mcin inside $(IntDir) to emit main_device.obj. -->
<CustomBuildStep>
<Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
</CustomBuildStep>
<CustomBuildStep>
<Message>Generating Device Offload Object</Message>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>$(IntDir)main_device.obj</Outputs>
</CustomBuildStep>
<CustomBuildStep>
<!-- The last entry must match the file name written by the hip_obj_gen_win.mcin copy item,
     otherwise changes to that file do not re-trigger this step. -->
<Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

106
HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters

@ -1,53 +1,53 @@ @@ -1,53 +1,53 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4f2a1544-a556-4afb-b630-36ba54c0ab4a}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{b93521e0-9944-411a-9f6e-4071af6bc7ea}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{972f07c3-b925-4516-bd65-2d5a3f626888}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<!-- Places every custom-built input (device assembly files and the llvm-mc input)
     under the "Source Files" filter. Each file is listed exactly once. -->
<CustomBuild Include="main_gfx803.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx900.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx906.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx908.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx90a.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx1030.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="hip_obj_gen_win.mcin">
<Filter>Source Files</Filter>
</CustomBuild>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4f2a1544-a556-4afb-b630-36ba54c0ab4a}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{b93521e0-9944-411a-9f6e-4071af6bc7ea}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{972f07c3-b925-4516-bd65-2d5a3f626888}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<!-- Places every custom-built input (device assembly files and the llvm-mc input)
     under the "Source Files" filter. Each file is listed exactly once. -->
<CustomBuild Include="main_gfx803.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx900.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx906.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx908.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx90a.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="main_gfx1030.s">
<Filter>Source Files</Filter>
</CustomBuild>
<CustomBuild Include="hip_obj_gen_win.mcin">
<Filter>Source Files</Filter>
</CustomBuild>
</ItemGroup>
</Project>

2
HIP-Basic/assembly_to_executable/main.hip

@ -31,7 +31,7 @@ @@ -31,7 +31,7 @@
/// \brief Device function to square each element
/// in the array `in` and write to array `out`.
template<typename T>
__global__ void vector_square_kernel(T* out, const T* in, const long long size)
__global__ void vector_square_kernel(T* out, const T* in, const unsigned long long size)
{
// Get the unique global thread ID
const size_t offset = blockIdx.x * blockDim.x + threadIdx.x;

78
HIP-Basic/assembly_to_executable/main_gfx1030.s

@ -1,55 +1,55 @@ @@ -1,55 +1,55 @@
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx1030"
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
s_load_dword s0, s[4:5], 0x4
s_load_dwordx2 s[2:3], s[6:7], 0x10
v_mov_b32_e32 v1, 0
s_waitcnt lgkmcnt(0)
s_and_b32 s0, s0, 0xffff
s_mul_i32 s8, s8, s0
v_add_nc_u32_e32 v0, s8, v0
v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
s_and_saveexec_b32 s0, vcc_lo
s_cbranch_execz BB0_3
v_mad_u64_u32 v[0:1], null, s8, s0, v[0:1]
v_mov_b32_e32 v1, 0
s_mov_b32 s0, exec_lo
v_cmpx_gt_u64_e64 s[2:3], v[0:1]
s_cbranch_execz .LBB0_3
; %bb.1:
s_load_dword s8, s[4:5], 0xc
s_load_dwordx4 s[4:7], s[6:7], 0x0
v_lshlrev_b64 v[2:3], 2, v[0:1]
s_mov_b32 s9, 0
s_mov_b32 s1, s9
s_waitcnt lgkmcnt(0)
s_lshl_b64 s[10:11], s[8:9], 2
.p2align 6
BB0_2: ; =>This Inner Loop Header: Depth=1
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_add_co_u32 v4, vcc_lo, s6, v2
v_add_co_ci_u32_e32 v5, vcc_lo, s7, v3, vcc_lo
v_add_co_u32 v0, vcc_lo, v0, s8
v_add_co_ci_u32_e32 v1, vcc_lo, s9, v1, vcc_lo
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
global_load_dword v6, v[4:5], off
v_add_co_u32 v4, vcc_lo, s4, v2
v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
v_add_co_u32 v2, s0, v2, s10
v_add_co_ci_u32_e64 v3, s0, s11, v3, s0
s_or_b32 s1, vcc_lo, s1
s_or_b32 s9, vcc_lo, s9
s_waitcnt vmcnt(0)
v_mul_f32_e32 v6, v6, v6
global_store_dword v[4:5], v6, off
s_andn2_b32 exec_lo, exec_lo, s1
s_cbranch_execnz BB0_2
BB0_3:
s_andn2_b32 exec_lo, exec_lo, s9
s_cbranch_execnz .LBB0_2
.LBB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 80
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
@ -77,6 +77,7 @@ BB0_3: @@ -77,6 +77,7 @@ BB0_3:
.amdhsa_workgroup_processor_mode 1
.amdhsa_memory_ordered 1
.amdhsa_forward_progress 0
.amdhsa_shared_vgpr_count 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
@ -85,13 +86,13 @@ BB0_3: @@ -85,13 +86,13 @@ BB0_3:
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 188
; codeLenInByte = 212
; NumSgprs: 14
; NumVgprs: 7
; ScratchSize: 0
@ -147,7 +148,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -147,7 +148,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
.section ".note.GNU-stack"
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
@ -169,44 +170,19 @@ amdhsa.kernels: @@ -169,44 +170,19 @@ amdhsa.kernels:
- .offset: 16
.size: 8
.value_kind: by_value
- .offset: 24
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 32
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 40
.size: 8
.value_kind: hidden_global_offset_z
- .address_space: global
.offset: 48
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 56
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 64
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 72
.size: 8
.value_kind: hidden_multigrid_sync_arg
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 80
.kernarg_segment_size: 24
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z20vector_square_kernelIfEvPT_PKS0_x
.name: _Z20vector_square_kernelIfEvPT_PKS0_y
.private_segment_fixed_size: 0
.sgpr_count: 14
.sgpr_spill_count: 0
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
.vgpr_count: 7
.vgpr_spill_count: 0
.wavefront_size: 32

61
HIP-Basic/assembly_to_executable/main_gfx803.s

@ -1,10 +1,11 @@ @@ -1,10 +1,11 @@
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx803"
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
s_load_dword s0, s[4:5], 0x4
s_load_dwordx2 s[10:11], s[6:7], 0x10
@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
v_add_u32_e32 v0, vcc, s8, v0
v_cmp_gt_u64_e32 vcc, s[10:11], v[0:1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz BB0_3
s_cbranch_execz .LBB0_3
; %bb.1:
s_load_dword s8, s[4:5], 0xc
s_load_dwordx4 s[4:7], s[6:7], 0x0
@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
s_mov_b64 s[14:15], 0
s_waitcnt lgkmcnt(0)
s_lshl_b64 s[12:13], s[8:9], 2
BB0_2: ; =>This Inner Loop Header: Depth=1
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_mov_b32_e32 v5, s7
v_add_u32_e32 v4, vcc, s6, v2
v_addc_u32_e32 v5, vcc, v5, v3, vcc
@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1 @@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1
v_mul_f32_e32 v6, v6, v6
flat_store_dword v[4:5], v6
s_andn2_b64 exec, exec, s[14:15]
s_cbranch_execnz BB0_2
BB0_3:
s_cbranch_execnz .LBB0_2
.LBB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 80
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
@ -83,13 +85,13 @@ BB0_3: @@ -83,13 +85,13 @@ BB0_3:
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 200
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; ScratchSize: 0
@ -142,7 +144,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -142,7 +144,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
.section ".note.GNU-stack"
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
@ -164,44 +166,19 @@ amdhsa.kernels: @@ -164,44 +166,19 @@ amdhsa.kernels:
- .offset: 16
.size: 8
.value_kind: by_value
- .offset: 24
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 32
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 40
.size: 8
.value_kind: hidden_global_offset_z
- .address_space: global
.offset: 48
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 56
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 64
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 72
.size: 8
.value_kind: hidden_multigrid_sync_arg
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 80
.kernarg_segment_size: 24
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z20vector_square_kernelIfEvPT_PKS0_x
.name: _Z20vector_square_kernelIfEvPT_PKS0_y
.private_segment_fixed_size: 0
.sgpr_count: 18
.sgpr_spill_count: 0
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64

61
HIP-Basic/assembly_to_executable/main_gfx900.s

@ -1,10 +1,11 @@ @@ -1,10 +1,11 @@
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx900"
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
s_load_dword s0, s[4:5], 0x4
s_load_dwordx2 s[12:13], s[6:7], 0x10
@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
v_add_u32_e32 v0, s8, v0
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz BB0_3
s_cbranch_execz .LBB0_3
; %bb.1:
s_load_dword s14, s[4:5], 0xc
s_load_dwordx4 s[8:11], s[6:7], 0x0
@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
s_mov_b64 s[6:7], 0
s_waitcnt lgkmcnt(0)
s_lshl_b64 s[4:5], s[14:15], 2
BB0_2: ; =>This Inner Loop Header: Depth=1
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_mov_b32_e32 v5, s11
v_add_co_u32_e32 v4, vcc, s10, v2
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1 @@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1
v_mul_f32_e32 v6, v6, v6
global_store_dword v[4:5], v6, off
s_andn2_b64 exec, exec, s[6:7]
s_cbranch_execnz BB0_2
BB0_3:
s_cbranch_execnz .LBB0_2
.LBB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 80
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
@ -85,13 +87,13 @@ BB0_3: @@ -85,13 +87,13 @@ BB0_3:
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 200
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; ScratchSize: 0
@ -144,7 +146,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -144,7 +146,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
.section ".note.GNU-stack"
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
@ -166,44 +168,19 @@ amdhsa.kernels: @@ -166,44 +168,19 @@ amdhsa.kernels:
- .offset: 16
.size: 8
.value_kind: by_value
- .offset: 24
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 32
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 40
.size: 8
.value_kind: hidden_global_offset_z
- .address_space: global
.offset: 48
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 56
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 64
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 72
.size: 8
.value_kind: hidden_multigrid_sync_arg
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 80
.kernarg_segment_size: 24
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z20vector_square_kernelIfEvPT_PKS0_x
.name: _Z20vector_square_kernelIfEvPT_PKS0_y
.private_segment_fixed_size: 0
.sgpr_count: 18
.sgpr_spill_count: 0
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64

61
HIP-Basic/assembly_to_executable/main_gfx906.s

@ -1,10 +1,11 @@ @@ -1,10 +1,11 @@
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx906"
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
s_load_dword s0, s[4:5], 0x4
s_load_dwordx2 s[12:13], s[6:7], 0x10
@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
v_add_u32_e32 v0, s8, v0
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz BB0_3
s_cbranch_execz .LBB0_3
; %bb.1:
s_load_dword s14, s[4:5], 0xc
s_load_dwordx4 s[8:11], s[6:7], 0x0
@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
s_mov_b64 s[6:7], 0
s_waitcnt lgkmcnt(0)
s_lshl_b64 s[4:5], s[14:15], 2
BB0_2: ; =>This Inner Loop Header: Depth=1
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_mov_b32_e32 v5, s11
v_add_co_u32_e32 v4, vcc, s10, v2
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1 @@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1
v_mul_f32_e32 v6, v6, v6
global_store_dword v[4:5], v6, off
s_andn2_b64 exec, exec, s[6:7]
s_cbranch_execnz BB0_2
BB0_3:
s_cbranch_execnz .LBB0_2
.LBB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 80
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
@ -85,13 +87,13 @@ BB0_3: @@ -85,13 +87,13 @@ BB0_3:
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 200
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; ScratchSize: 0
@ -144,7 +146,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -144,7 +146,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
.section ".note.GNU-stack"
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
@ -166,44 +168,19 @@ amdhsa.kernels: @@ -166,44 +168,19 @@ amdhsa.kernels:
- .offset: 16
.size: 8
.value_kind: by_value
- .offset: 24
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 32
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 40
.size: 8
.value_kind: hidden_global_offset_z
- .address_space: global
.offset: 48
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 56
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 64
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 72
.size: 8
.value_kind: hidden_multigrid_sync_arg
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 80
.kernarg_segment_size: 24
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z20vector_square_kernelIfEvPT_PKS0_x
.name: _Z20vector_square_kernelIfEvPT_PKS0_y
.private_segment_fixed_size: 0
.sgpr_count: 18
.sgpr_spill_count: 0
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64

64
HIP-Basic/assembly_to_executable/main_gfx908.s

@ -1,10 +1,11 @@ @@ -1,10 +1,11 @@
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx908"
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
s_load_dword s0, s[4:5], 0x4
s_load_dwordx2 s[12:13], s[6:7], 0x10
@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
v_add_u32_e32 v0, s8, v0
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz BB0_3
s_cbranch_execz .LBB0_3
; %bb.1:
s_load_dword s14, s[4:5], 0xc
s_load_dwordx4 s[8:11], s[6:7], 0x0
@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
s_mov_b64 s[6:7], 0
s_waitcnt lgkmcnt(0)
s_lshl_b64 s[4:5], s[14:15], 2
BB0_2: ; =>This Inner Loop Header: Depth=1
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_mov_b32_e32 v5, s11
v_add_co_u32_e32 v4, vcc, s10, v2
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1 @@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1
v_mul_f32_e32 v6, v6, v6
global_store_dword v[4:5], v6, off
s_andn2_b64 exec, exec, s[6:7]
s_cbranch_execnz BB0_2
BB0_3:
s_cbranch_execnz .LBB0_2
.LBB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 80
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
@ -85,13 +87,13 @@ BB0_3: @@ -85,13 +87,13 @@ BB0_3:
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 200
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; NumAgprs: 0
@ -146,7 +148,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -146,7 +148,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
.section ".note.GNU-stack"
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
@ -156,7 +158,8 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -156,7 +158,8 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.amdgpu_metadata
---
amdhsa.kernels:
- .args:
- .agpr_count: 0
.args:
- .address_space: global
.offset: 0
.size: 8
@ -168,44 +171,19 @@ amdhsa.kernels: @@ -168,44 +171,19 @@ amdhsa.kernels:
- .offset: 16
.size: 8
.value_kind: by_value
- .offset: 24
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 32
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 40
.size: 8
.value_kind: hidden_global_offset_z
- .address_space: global
.offset: 48
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 56
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 64
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 72
.size: 8
.value_kind: hidden_multigrid_sync_arg
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 80
.kernarg_segment_size: 24
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z20vector_square_kernelIfEvPT_PKS0_x
.name: _Z20vector_square_kernelIfEvPT_PKS0_y
.private_segment_fixed_size: 0
.sgpr_count: 18
.sgpr_spill_count: 0
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64

64
HIP-Basic/assembly_to_executable/main_gfx90a.s

@ -1,10 +1,11 @@ @@ -1,10 +1,11 @@
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx90a"
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
s_load_dword s0, s[4:5], 0x4
s_load_dwordx2 s[12:13], s[6:7], 0x10
@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -15,7 +16,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
v_add_u32_e32 v0, s8, v0
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz BB0_3
s_cbranch_execz .LBB0_3
; %bb.1:
s_load_dword s14, s[4:5], 0xc
s_load_dwordx4 s[8:11], s[6:7], 0x0
@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x @@ -24,7 +25,7 @@ _Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
s_mov_b64 s[6:7], 0
s_waitcnt lgkmcnt(0)
s_lshl_b64 s[4:5], s[14:15], 2
BB0_2: ; =>This Inner Loop Header: Depth=1
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_mov_b32_e32 v5, s11
v_add_co_u32_e32 v4, vcc, s10, v2
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1 @@ -44,15 +45,16 @@ BB0_2: ; =>This Inner Loop Header: Depth=1
v_mul_f32_e32 v6, v6, v6
global_store_dword v[4:5], v6, off
s_andn2_b64 exec, exec, s[6:7]
s_cbranch_execnz BB0_2
BB0_3:
s_cbranch_execnz .LBB0_2
.LBB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 80
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
@ -87,13 +89,13 @@ BB0_3: @@ -87,13 +89,13 @@ BB0_3:
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 200
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; NumAgprs: 0
@ -154,7 +156,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -154,7 +156,7 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.zero 1
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
.section ".note.GNU-stack"
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
@ -164,7 +166,8 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: @@ -164,7 +166,8 @@ _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.amdgpu_metadata
---
amdhsa.kernels:
- .args:
- .agpr_count: 0
.args:
- .address_space: global
.offset: 0
.size: 8
@ -176,44 +179,19 @@ amdhsa.kernels: @@ -176,44 +179,19 @@ amdhsa.kernels:
- .offset: 16
.size: 8
.value_kind: by_value
- .offset: 24
.size: 8
.value_kind: hidden_global_offset_x
- .offset: 32
.size: 8
.value_kind: hidden_global_offset_y
- .offset: 40
.size: 8
.value_kind: hidden_global_offset_z
- .address_space: global
.offset: 48
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 56
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 64
.size: 8
.value_kind: hidden_none
- .address_space: global
.offset: 72
.size: 8
.value_kind: hidden_multigrid_sync_arg
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 80
.kernarg_segment_size: 24
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z20vector_square_kernelIfEvPT_PKS0_x
.name: _Z20vector_square_kernelIfEvPT_PKS0_y
.private_segment_fixed_size: 0
.sgpr_count: 18
.sgpr_spill_count: 0
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
.vgpr_count: 9
.vgpr_spill_count: 0
.wavefront_size: 64

26
HIP-Basic/bandwidth/Makefile

@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE)

50
HIP-Basic/bandwidth/bandwidth_vs2019.sln

@ -1,25 +1,25 @@ @@ -1,25 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidth_vs2019", "bandwidth_vs2019.vcxproj", "{16B11B54-CD72-43B6-B226-38C668B41A79}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.ActiveCfg = Debug|x64
{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.Build.0 = Debug|x64
{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.ActiveCfg = Release|x64
{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {1E2ACB7F-1706-491A-9E62-395C1BD8E637}
EndGlobalSection
EndGlobal

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidth_vs2019", "bandwidth_vs2019.vcxproj", "{16B11B54-CD72-43B6-B226-38C668B41A79}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.ActiveCfg = Debug|x64
{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.Build.0 = Debug|x64
{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.ActiveCfg = Release|x64
{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {1E2ACB7F-1706-491A-9E62-395C1BD8E637}
EndGlobalSection
EndGlobal

204
HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj

@ -1,102 +1,102 @@ @@ -1,102 +1,102 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\cmdparser.hpp" />
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{16b11b54-cd72-43b6-b226-38c668b41a79}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>bandwidth_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
  <!-- Compile and link settings that apply only to the Debug|x64 configuration. -->
  <ClCompile>
    <!-- Same warning level as the Release|x64 configuration of this project. -->
    <WarningLevel>Level2</WarningLevel>
    <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    <LanguageStandard>stdcpp17</LanguageStandard>
    <!-- Makes the shared headers in the repository's Common directory visible to main.hip. -->
    <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
    <RuntimeTypeInfo>true</RuntimeTypeInfo>
  </ClCompile>
  <Link>
    <SubSystem>Console</SubSystem>
    <GenerateDebugInformation>true</GenerateDebugInformation>
  </Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\cmdparser.hpp" />
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{16b11b54-cd72-43b6-b226-38c668b41a79}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>bandwidth_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

58
HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters

@ -1,30 +1,30 @@ @@ -1,30 +1,30 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{c71d9db2-bf13-49ee-b794-626d24391150}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{520f4985-c9bd-4add-9485-049fafe0cdca}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{006f799a-d711-49a7-93da-7f60d8872b02}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\cmdparser.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{c71d9db2-bf13-49ee-b794-626d24391150}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{520f4985-c9bd-4add-9485-049fafe0cdca}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{006f799a-d711-49a7-93da-7f60d8872b02}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\cmdparser.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

1
HIP-Basic/bit_extract/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
hip_bit_extract

57
HIP-Basic/bit_extract/CMakeLists.txt

@ -0,0 +1,57 @@ @@ -0,0 +1,57 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Build configuration for the hip_bit_extract example.
# Compiles main.hip as HIP or CUDA (selected through the GPU_RUNTIME cache
# variable) and registers the resulting executable as a test.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

set(example_name hip_bit_extract)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME selects the language that main.hip is compiled as.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

# Reject any value other than the two supported runtimes.
if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
    message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

# Enable the selected language and require C++17 without compiler extensions.
enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

# Use the ROCm installation as the package search prefix unless the user
# already provided a CMAKE_PREFIX_PATH.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)
# Make example runnable using ctest. The NAME/COMMAND signature is used because
# it resolves the target name to the location of the built executable, which
# the legacy add_test(<name> <command>) signature does not do.
add_test(NAME ${example_name} COMMAND ${example_name})

# Shared example headers; anchored to this directory instead of relying on
# relative-path resolution.
set(include_dirs "${CMAKE_CURRENT_SOURCE_DIR}/../../Common")
# The CUDA build additionally gets the ROCm include directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
    list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()

target_include_directories(${example_name} PRIVATE ${include_dirs})
# main.hip must be compiled with the selected GPU language, whatever its extension.
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

60
HIP-Basic/bit_extract/Makefile

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Builds the hip_bit_extract example from main.hip with a single compiler
# invocation and provides a `clean` target that removes the executable.

# Name of the executable produced by the default target.
EXAMPLE := hip_bit_extract
# Directory containing the headers shared between the examples.
COMMON_INCLUDE_DIR := ../../Common
# Selects the code path below; must be HIP or CUDA.
GPU_RUNTIME := HIP
# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
# `?=` keeps a HIPCXX value that is already set (e.g. from the environment).
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
# The I-prefixed variables collect this Makefile's own flags; the user-supplied
# CXXFLAGS/CPPFLAGS/LDFLAGS/LDLIBS are appended to them further down.
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
# Runtime-specific flags. Any GPU_RUNTIME other than CUDA or HIP aborts with an error.
ifeq ($(GPU_RUNTIME), CUDA)
# NOTE(review): `-x cu` presumably makes the compiler treat main.hip as CUDA source - confirm.
ICXXFLAGS += -x cu
# The HIP headers are added as a system include directory for the CUDA build.
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
# Default warning flags, used only when CXXFLAGS is not already set.
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
# Append the user-supplied flags after the internal ones.
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
# Compile and link in one step; `$<` is main.hip (the first prerequisite), so
# example_utils.hpp only triggers a rebuild and is not passed to the compiler.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
	$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
# Remove the built executable.
clean:
	$(RM) $(EXAMPLE)
.PHONY: clean

33
HIP-Basic/bit_extract/README.md

@ -0,0 +1,33 @@ @@ -0,0 +1,33 @@
# HIP-Basic Bit Extract Example
## Description
A HIP-specific bit extract solution is presented in this example.
### Application flow
1. Allocate memory for host vectors.
2. Fill the input host vector as an arithmetic sequence by the vector index.
3. Allocate memory for device arrays.
4. Copy the arithmetic sequence from the host to device memory.
5. Apply the bit extract operation to the sequence element by element and store the results in the output array. When the code is compiled for the AMD platform, the `__bitextract_u32()` device function is used; otherwise the bits are extracted with a standard mask and bit shift.
6. Copy the result sequence from the device to the host memory.
7. Compare the result sequence to the expected sequence, element by element. Mismatches are counted; if any are found, the number of errors is printed and the program exits with an error code.
8. Deallocate device and host memory.
9. "Validation passed." is printed when the flow was successful.
## Key APIs and Concepts
- `hipLaunchKernelGGL(kernel_name, grid_dim, block_dim, dynamic_shared_memory_size, stream, <kernel arguments>)` is the HIP kernel launcher, in which the grid and block dimensions, the dynamic shared memory size and the HIP stream are specified. This example uses the default (NULL) stream.
- `__bitextract_u32(source, bit_start, num_bits)` is the built-in AMD HIP bit extract operator. It takes a `source` scalar, the position `bit_start` of the first bit to extract and the number of bits `num_bits` to extract, and returns the extracted bits as a scalar value.
## Demonstrated API Calls
### HIP runtime
#### Device symbols
- `threadIdx`, `blockIdx`, `blockDim`, `gridDim`
- `__bitextract_u32`
#### Host symbols
- `hipMalloc`
- `hipFree`
- `hipMemcpy`
- `hipMemcpyHostToDevice`
- `hipMemcpyDeviceToHost`
- `hipLaunchKernelGGL`

25
HIP-Basic/bit_extract/bit_extract_vs2019.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bit_extract_vs2019", "bit_extract_vs2019.vcxproj", "{63823DD0-787C-42AE-B6E7-C03CF4CF5CE2}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{63823DD0-787C-42AE-B6E7-C03CF4CF5CE2}.Debug|x64.ActiveCfg = Debug|x64
{63823DD0-787C-42AE-B6E7-C03CF4CF5CE2}.Debug|x64.Build.0 = Debug|x64
{63823DD0-787C-42AE-B6E7-C03CF4CF5CE2}.Release|x64.ActiveCfg = Release|x64
{63823DD0-787C-42AE-B6E7-C03CF4CF5CE2}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0}
EndGlobalSection
EndGlobal

99
HIP-Basic/bit_extract/bit_extract_vs2019.vcxproj

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{63823DD0-787C-42AE-B6E7-C03CF4CF5CE2}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>bit_extract_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

27
HIP-Basic/bit_extract/bit_extract_vs2019.vcxproj.filters

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{2932a426-602b-4926-887e-27c50ba7eab7}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{ed043ec4-e8ac-4831-93f5-a58546ec7bea}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{0da954bd-e555-4454-b082-b68d10c753b9}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

111
HIP-Basic/bit_extract/main.hip

@ -0,0 +1,111 @@ @@ -0,0 +1,111 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
/// \brief Extracts the 4 bits starting at bit 8 from every unsigned 32-bit input element
/// and writes the extracted value to the matching position of the output array.
/// - When compiled for the AMD HIP platform, the __bitextract_u32() built-in performs the extraction.
/// - On any other platform an equivalent mask-and-shift expression is used.
/// \param d_output Device array receiving one extracted value per input element.
/// \param d_input  Device array holding the values to extract from.
/// \param size     Number of elements to process.
__global__ void bit_extract_kernel(uint32_t* d_output, const uint32_t* d_input, size_t size)
{
    // Every thread starts at its global index and advances by the total number of
    // launched threads, so the whole input is covered for any launch configuration.
    const size_t first_index  = (blockIdx.x * blockDim.x + threadIdx.x);
    const size_t thread_count = blockDim.x * gridDim.x;

    size_t index = first_index;
    while(index < size)
    {
#ifdef __HIP_PLATFORM_AMD__
        d_output[index] = __bitextract_u32(d_input[index], 8, 4);
#else /* defined __HIP_PLATFORM_NVIDIA__ or other path */
        d_output[index] = ((d_input[index] & 0xf00) >> 8);
#endif
        index += thread_count;
    }
}
int main()
{
constexpr size_t size = 1000000;
constexpr size_t size_in_bytes = size * sizeof(uint32_t);
// Allocate host vectors
std::vector<uint32_t> h_input(size);
std::vector<uint32_t> h_output(size);
// Set up input data
for(size_t i = 0; i < size; i++)
{
h_input[i] = i;
}
// Allocate device memory for the input and output data
uint32_t *d_input, *d_output;
HIP_CHECK(hipMalloc(&d_input, size_in_bytes));
HIP_CHECK(hipMalloc(&d_output, size_in_bytes));
// Copy data from host to device
HIP_CHECK(hipMemcpy(d_input, h_input.data(), size_in_bytes, hipMemcpyHostToDevice));
// Launch bit_extract_kernel()
constexpr unsigned int number_of_blocks = 512;
constexpr unsigned int threads_per_block = 256;
hipLaunchKernelGGL(bit_extract_kernel,
dim3(number_of_blocks),
dim3(threads_per_block),
0,
hipStreamDefault,
d_output,
d_input,
size);
// Copy data from device to host
HIP_CHECK(hipMemcpy(h_output.data(), d_output, size_in_bytes, hipMemcpyDeviceToHost));
// Free device memory
HIP_CHECK(hipFree(d_input));
HIP_CHECK(hipFree(d_output));
// Check result validity
unsigned int errors{};
for(size_t i = 0; i < size; i++)
{
uint32_t reference_value = ((h_input[i] & 0xf00) >> 8);
if(h_output[i] != reference_value)
{
errors++;
}
}
if(errors != 0)
{
std::cout << "Validation failed. Errors: " << errors << std::endl;
return error_exit_code;
}
else
{
std::cout << "Validation passed." << std::endl;
}
}

1
HIP-Basic/cooperative_groups/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
hip_cooperative_groups

57
HIP-Basic/cooperative_groups/CMakeLists.txt

@ -0,0 +1,57 @@ @@ -0,0 +1,57 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Build configuration for the hip_cooperative_groups example.
# Compiles main.hip as HIP or CUDA (selected through the GPU_RUNTIME cache
# variable) and registers the resulting executable as a test.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

set(example_name hip_cooperative_groups)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME selects the language that main.hip is compiled as.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

# Reject any value other than the two supported runtimes.
if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
    message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

# Enable the selected language and require C++17 without compiler extensions.
enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

# Use the ROCm installation as the package search prefix unless the user
# already provided a CMAKE_PREFIX_PATH.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)
# Make example runnable using ctest. The NAME/COMMAND signature is used because
# it resolves the target name to the location of the built executable, which
# the legacy add_test(<name> <command>) signature does not do.
add_test(NAME ${example_name} COMMAND ${example_name})

# Shared example headers; anchored to this directory instead of relying on
# relative-path resolution.
set(include_dirs "${CMAKE_CURRENT_SOURCE_DIR}/../../Common")
# The CUDA build additionally gets the ROCm include directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
    list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()

target_include_directories(${example_name} PRIVATE ${include_dirs})
# main.hip must be compiled with the selected GPU language, whatever its extension.
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

60
HIP-Basic/cooperative_groups/Makefile

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Builds the cooperative groups example with hipcc.

# Name of the executable; building it is the default goal.
EXAMPLE := hip_cooperative_groups

# Directory with the headers shared between the examples.
COMMON_INCLUDE_DIR := ../../Common

# Selects the branch taken below: HIP or CUDA.
GPU_RUNTIME := HIP

# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include

# Compiler driver; only assigned here when HIPCXX is not already set.
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc

# Common variables and flags.
# The I-prefixed variables hold the flags chosen by this Makefile; the
# user-facing CXXFLAGS, CPPFLAGS, LDFLAGS and LDLIBS are appended to them below.
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=

ifeq ($(GPU_RUNTIME), HIP)
  # Default warning flags, used only when CXXFLAGS is not already set.
  CXXFLAGS ?= -Wall -Wextra
else ifeq ($(GPU_RUNTIME), CUDA)
  # Treat the source as CUDA and pick up the HIP headers as system headers.
  ICXXFLAGS += -x cu
  ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else
  $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif

ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)

# Only the first prerequisite (main.hip) is passed to the compiler; the shared
# header is listed so that a change to it triggers a rebuild.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
	$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)

.PHONY: clean
clean:
	$(RM) $(EXAMPLE)

41
HIP-Basic/cooperative_groups/README.md

@ -0,0 +1,41 @@ @@ -0,0 +1,41 @@
# HIP-Basic Cooperative Groups Example
## Description
This program showcases the usage of Cooperative Groups inside a reduction kernel.
Cooperative groups can be used to gain more control over synchronization.
For more insights, you can read the following blog post:
[Cooperative Groups: Flexible CUDA Thread Programming](https://developer.nvidia.com/blog/cooperative-groups/)
### Application flow
1. A number of variables are defined to control the problem details and the kernel launch parameters.
2. Input vector is set up in host memory.
3. The input is copied to the device.
4. The GPU reduction kernel is launched with previously defined arguments.
5. The kernel will perform two reductions: a reduction of the whole threadblock and a reduction of custom partitions.
6. The result vectors are copied back to the host and all device memory is freed.
7. The elements of the result vectors are compared with the expected result. The result of the comparison is printed to the standard output.
## Key APIs and Concepts
Usually, programmers can only synchronize on warp-level or block-level.
Cooperative groups, however, allow the programmer to partition threads into groups and subsequently synchronize those groups.
The partitioned threads can reside across multiple devices.
## Demonstrated API Calls
### HIP runtime
#### Device symbols
- `thread_group`
- `thread_block`
- `tiled_partition<size>()`
- `thread_block_tile`
- All above from the [`cooperative_groups` namespace](https://github.com/ROCm-Developer-Tools/hipamd/blob/develop/include/hip/amd_detail/amd_hip_cooperative_groups.h)
#### Host symbols
- `hipMalloc`
- `hipMemcpy`
- `hipLaunchCooperativeKernel`
- `hipDeviceAttributeCooperativeLaunch`
- `hipDeviceGetAttribute`
- `HIP_KERNEL_NAME`
- `hipGetLastError`
- `hipFree`

25
HIP-Basic/cooperative_groups/cooperative_groups_vs2019.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cooperative_groups_vs2019", "cooperative_groups_vs2019.vcxproj", "{7A25CE69-BACE-4410-BEB0-12A69890F212}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{7A25CE69-BACE-4410-BEB0-12A69890F212}.Debug|x64.ActiveCfg = Debug|x64
{7A25CE69-BACE-4410-BEB0-12A69890F212}.Debug|x64.Build.0 = Debug|x64
{7A25CE69-BACE-4410-BEB0-12A69890F212}.Release|x64.ActiveCfg = Release|x64
{7A25CE69-BACE-4410-BEB0-12A69890F212}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {69A6C748-F535-4DEF-85D1-54825AB819B9}
EndGlobalSection
EndGlobal

99
HIP-Basic/cooperative_groups/cooperative_groups_vs2019.vcxproj

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{7a25ce69-bace-4410-beb0-12a69890f212}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>cooperative_groups_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

27
HIP-Basic/cooperative_groups/cooperative_groups_vs2019.vcxproj.filters

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{25db88ec-6f1f-49d0-bd14-b0b028a2f0b6}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{c165da41-0b12-43fe-afa0-eb1ce67ad002}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{5905baad-b4ce-4f30-b9a8-274cdfeea1e0}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

249
HIP-Basic/cooperative_groups/main.hip

@ -0,0 +1,249 @@ @@ -0,0 +1,249 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <hip/hip_cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
#include <cstddef>
#include <cstdlib>
using namespace cooperative_groups;
/// \brief Sums the `val` contributions of all threads in `thread_group g` using shared memory `x`.
/// \param g   Group whose threads take part in the reduction.
/// \param x   Workspace indexed by the thread's rank in `g`; it is accessed at ranks
///            `0 .. g.size() - 1`.
/// \param val Contribution of the calling thread.
/// \return The sum over the group on the thread with rank 0, and 0 on every other thread.
/// \note The number of active threads is halved with integer division every round, so every
///       contribution is only accounted for when `g.size()` is a power of two.
__device__ unsigned int reduce_sum(thread_group g, unsigned int* x, unsigned int val)
{
    // Rank of the calling thread within the group.
    const unsigned int rank = g.thread_rank();

    // `active` is the number of threads that still accumulate values. It starts at half
    // the group size and halves each round until no thread is left.
    unsigned int active = g.size() / 2;
    while(active > 0)
    {
        // Publish the current partial sum so that the partner thread can read it.
        x[rank] = val;
        g.sync();

        // A thread in the lower half adds the value of its counterpart in the upper half.
        if(rank < active)
        {
            val += x[rank + active];
        }
        // All reads must be finished before the workspace is overwritten in the next round.
        g.sync();

        active /= 2;
    }

    // Only the first thread holds the complete sum.
    return (g.thread_rank() == 0) ? val : 0;
}
/// \brief A vector reduction kernel showcasing the use of cooperative groups.
/// - First the whole thread block is reduced as a single `thread_block` group.
/// - Second the block is split with `tiled_partition<>()` and every tile is reduced on its own.
/// \tparam PartitionSize The number of threads in each tile created by `tiled_partition<>()`.
/// \param d_vector Input values, read at the thread's rank within its block.
/// \param d_block_reduced_vector Element 0 receives the sum over the whole block.
/// \param d_partition_reduced_vector Receives one sum per tile, indexed by the tile's number
///        within the block.
/// \note Every index is derived from the rank within the block; no block index is used, so
///       all blocks of a launch read the same inputs and write the same output elements.
template<unsigned int PartitionSize>
__global__ void vector_reduce_kernel(const unsigned int* d_vector,
                                     unsigned int*       d_block_reduced_vector,
                                     unsigned int*       d_partition_reduced_vector)
{
    // Workspace array in shared memory, used by both reductions below.
    __shared__ unsigned int workspace[2048];

    // Group that consists of all threads in the block.
    thread_block       block      = this_thread_block();
    const unsigned int block_rank = block.thread_rank();

    // Value this thread contributes to both reductions.
    const unsigned int input = d_vector[block_rank];

    // Block-wide reduction; only the thread with rank 0 receives the valid sum.
    const unsigned int block_sum = reduce_sum(block, workspace, input);
    if(block_rank == 0)
    {
        d_block_reduced_vector[0] = block_sum;
    }

    // Split the block into tiles of PartitionSize threads each.
    thread_block_tile<PartitionSize> tile      = tiled_partition<PartitionSize>(block);
    const unsigned int               tile_rank = tile.thread_rank();

    // Block rank of the first thread of this tile: offsetting the workspace by it gives
    // every tile its own piece of shared memory to work with.
    unsigned int* const tile_workspace = workspace + (block_rank - tile_rank);

    // Per-tile reduction; only the first thread of each tile receives the valid sum.
    const unsigned int tile_sum = reduce_sum(tile, tile_workspace, input);
    if(tile_rank == 0)
    {
        d_partition_reduced_vector[block_rank / PartitionSize] = tile_sum;
    }
}
/// \brief Host side function to perform the same reductions as executed on the GPU.
/// \param partition_size Number of consecutive elements that are summed into one result.
/// \param input Values to reduce. Trailing elements that do not fill a whole partition
///        are ignored.
/// \return One sum per complete partition, in input order. The result is empty when
///         `partition_size` is 0 or larger than `input.size()`.
std::vector<unsigned int> ref_reduced(const unsigned int               partition_size,
                                      const std::vector<unsigned int>& input)
{
    // A partition size of zero would divide by zero below; there is nothing to sum.
    if(partition_size == 0)
    {
        return {};
    }

    // Integer division drops an incomplete trailing partition.
    const std::size_t         result_size = input.size() / partition_size;
    std::vector<unsigned int> result(result_size);
    for(std::size_t i = 0; i < result_size; i++)
    {
        unsigned int partition_result = 0;
        for(std::size_t j = 0; j < partition_size; j++)
        {
            partition_result += input[partition_size * i + j];
        }
        result[i] = partition_result;
    }
    return result;
}
/// \brief Host entry point: sets up the input vector, launches the reduction kernel with a
/// cooperative launch and validates the device results against a host reference.
/// \return 0 when the device is skipped or validation passes, `error_exit_code` when the
/// validation fails.
int main()
{
// The capability query below is only compiled for the AMD platform.
#ifdef __HIP_PLATFORM_AMD__
int device = 0;
int supports_coop_launch = 0;
// Check support
// Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(
hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device));
// A device without cooperative launch support is not an error: report it and exit successfully.
if(!supports_coop_launch)
{
std::cout << "Skipping, device " << device << " does not support cooperative groups"
<< std::endl;
return 0;
}
#endif
// Number of blocks to launch.
constexpr unsigned int num_blocks = 1;
// Number of threads in each kernel block.
constexpr unsigned int threads_per_block = 64;
// Total element count of the input vector.
constexpr unsigned int size = num_blocks * threads_per_block;
// Total elements count of a tiled_partition.
constexpr unsigned int partition_size = 16;
// Total size (in bytes) of the input vector.
constexpr size_t size_bytes = sizeof(unsigned int) * size;
// The block must split into whole partitions.
static_assert(threads_per_block % partition_size == 0,
"threads_per_block must be a multiple of partition_size");
// Allocate host vectors.
// h_block_reduced holds one sum per block, h_partition_reduced one sum per partition of a block.
std::vector<unsigned int> h_vector(size);
std::vector<unsigned int> h_block_reduced(num_blocks);
std::vector<unsigned int> h_partition_reduced(threads_per_block / partition_size);
// Set up input data: element i holds the value i.
for(unsigned int i = 0; i < size; i++)
{
h_vector[i] = i;
}
// Allocate device memory for the input and output vectors.
unsigned int* d_vector{};
unsigned int* d_block_reduced{};
unsigned int* d_partition_reduced{};
HIP_CHECK(hipMalloc(&d_vector, size_bytes));
HIP_CHECK(hipMalloc(&d_block_reduced, sizeof(unsigned int) * h_block_reduced.size()));
HIP_CHECK(hipMalloc(&d_partition_reduced, sizeof(unsigned int) * h_partition_reduced.size()));
// Transfer the input vector to the device memory.
HIP_CHECK(hipMemcpy(d_vector, h_vector.data(), size_bytes, hipMemcpyHostToDevice));
// The kernel arguments are passed as an array holding the address of each argument,
// in the order of the kernel's parameters.
void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
// Launching kernel from host.
// The trailing arguments are the dynamic shared memory size in bytes (0) and the stream.
HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel<partition_size>,
dim3(num_blocks),
dim3(threads_per_block),
params,
0,
hipStreamDefault));
// Check if the kernel launch was successful.
HIP_CHECK(hipGetLastError());
// Transfer the result back to the host.
HIP_CHECK(hipMemcpy(h_block_reduced.data(),
d_block_reduced,
sizeof(unsigned int) * h_block_reduced.size(),
hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(h_partition_reduced.data(),
d_partition_reduced,
sizeof(unsigned int) * h_partition_reduced.size(),
hipMemcpyDeviceToHost));
// Free the resources on the device.
HIP_CHECK(hipFree(d_vector));
HIP_CHECK(hipFree(d_block_reduced));
HIP_CHECK(hipFree(d_partition_reduced));
// Perform the reference (CPU) calculation.
// Reducing with a partition as large as the block yields the expected block-wide sum.
std::vector<unsigned int> ref_block_reduced = ref_reduced(threads_per_block, h_vector);
std::vector<unsigned int> ref_partition_reduced = ref_reduced(partition_size, h_vector);
// Check the results' validity by counting the elements that differ from the reference.
unsigned int errors{};
for(unsigned int i = 0; i < h_block_reduced.size(); i++)
{
errors += (h_block_reduced[i] != ref_block_reduced[i]);
}
for(unsigned int i = 0; i < h_partition_reduced.size(); i++)
{
errors += (h_partition_reduced[i] != ref_partition_reduced[i]);
}
if(errors)
{
std::cout << "Validation failed. Errors: " << errors << std::endl;
// error_exit_code is not defined in this file; presumably it comes from example_utils.hpp.
return error_exit_code;
}
else
{
std::cout << "Validation passed." << std::endl;
}
// Reaching the end of main without a return statement returns 0.
}

1
HIP-Basic/device_globals/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
hip_device_globals

57
HIP-Basic/device_globals/CMakeLists.txt

@ -0,0 +1,57 @@ @@ -0,0 +1,57 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

# Name shared by the project, the executable target and the CTest test.
# It carries the hip_ prefix so that the executable matches the name used by
# this example's Makefile and .gitignore (hip_device_globals).
set(example_name hip_device_globals)
project(${example_name} LANGUAGES CXX)

# CMake only generates tests after enable_testing() has been invoked. Calling it
# here lets the add_test() below take effect when this directory is configured
# as a standalone project; repeating the call under a parent project is harmless.
enable_testing()

# GPU_RUNTIME selects the language main.hip is compiled as: HIP or CUDA.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
  set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
  message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

# Fall back to the ROCm installation for package lookup when the user did not
# provide a prefix path.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
  set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)

# Make example runnable using ctest.
# The NAME/COMMAND signature is used because only that form resolves a target
# name in COMMAND to the location of the built executable.
add_test(NAME ${example_name} COMMAND ${example_name})

# Headers shared between the examples; the relative path is interpreted relative
# to this source directory.
set(include_dirs "../../Common")
if(GPU_RUNTIME STREQUAL "CUDA")
  # In a CUDA build the HIP headers are taken from the ROCm installation.
  list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()
target_include_directories(${example_name} PRIVATE ${include_dirs})

# The .hip extension is compiled with whichever runtime language was selected.
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

60
HIP-Basic/device_globals/Makefile

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Builds the device globals example with hipcc.

# Name of the executable; building it is the default goal.
EXAMPLE := hip_device_globals

# Directory with the headers shared between the examples.
COMMON_INCLUDE_DIR := ../../Common

# Selects the branch taken below: HIP or CUDA.
GPU_RUNTIME := HIP

# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include

# Compiler driver; only assigned here when HIPCXX is not already set.
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc

# Common variables and flags.
# The I-prefixed variables hold the flags chosen by this Makefile; the
# user-facing CXXFLAGS, CPPFLAGS, LDFLAGS and LDLIBS are appended to them below.
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=

ifeq ($(GPU_RUNTIME), HIP)
  # Default warning flags, used only when CXXFLAGS is not already set.
  CXXFLAGS ?= -Wall -Wextra
else ifeq ($(GPU_RUNTIME), CUDA)
  # Treat the source as CUDA and pick up the HIP headers as system headers.
  ICXXFLAGS += -x cu
  ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else
  $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif

ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)

# Only the first prerequisite (main.hip) is passed to the compiler; the shared
# header is listed so that a change to it triggers a rebuild.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
	$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)

.PHONY: clean
clean:
	$(RM) $(EXAMPLE)

45
HIP-Basic/device_globals/README.md

@ -0,0 +1,45 @@ @@ -0,0 +1,45 @@
# HIP-Basic Device Globals Example
## Description
This program showcases a simple example that uses device global variables to perform a simple test kernel. Two such global variables are set using different methods: one is a single variable that is set by first obtaining a pointer to it and then using `hipMemcpy`, as would be done for a pointer to device memory obtained from `hipMalloc`. The other is an array that is initialized without first explicitly obtaining the pointer, by using `hipMemcpyToSymbol`.
### Application flow
1. A number of constants are defined for the kernel launch parameters.
2. The input and output vectors are initialized in host memory.
3. The necessary amount of device memory for the input and output vectors is allocated and the input data is copied to the device.
4. A pointer to the device global variable `global` is obtained via `hipGetSymbolAddress`.
5. The pointee is initialized by copying a value from the host to it.
6. The device global variable `global_array` is initialized by copying to it directly with `hipMemcpyToSymbol`.
7. The GPU kernel is then launched with the previously defined arguments.
8. The results are copied back to the host.
9. Device memory backing the input and output vectors is freed.
10. A reference computation is performed on the host and the results are compared with the expected result. The result of the comparison is printed to standard output.
## Key APIs and Concepts
Apart from via kernel parameters, values can also be passed to the device via _device global variables_: global variables that have the `__device__` attribute. These can be used from device kernels, and need to be initialized from the host before they hold a valid value. Device global variables are persistent between kernel launches, so they can also be used to communicate values between launches without explicitly managing a buffer for them on the host.
A device global variable cannot be used as a regular global variable from the host side. To manage them, a pointer to the device memory that they represent needs to be obtained first. This can be done using the functions `hipGetSymbolAddress(dev_ptr, symbol)` and `hipGetSymbolSize(size_ptr, symbol)`, which retrieve the device address and the size in bytes of the symbol, respectively. A device global variable can be passed directly to these functions by using the `HIP_SYMBOL(symbol)` macro. The device pointer obtained from `hipGetSymbolAddress` can be used in the same ways as memory obtained from `hipMalloc`, and so the corresponding value can be set by using `hipMemcpy`.
Device global variables may also be initialized directly by using `hipMemcpyToSymbol(symbol, host_source, size_bytes, offset = 0, kind = hipMemcpyHostToDevice)`. This method omits having to fetch the pointer to the device global variable explicitly. Similarly, `hipMemcpyFromSymbol(host_dest, symbol, size_bytes, offset = 0, kind = hipMemcpyDeviceToHost)` can be used to copy from a device global variable back to the host.
## Demonstrated API Calls
### HIP runtime
#### Device symbols
- `__global__`
- `__device__`
- `threadIdx`
- `blockDim`
- `blockIdx`
#### Host symbols
- `hipFree`
- `hipGetLastError`
- `hipGetSymbolAddress`
- `hipGetSymbolSize`
- `hipLaunchKernelGGL`
- `hipMalloc`
- `hipMemcpy`
- `hipMemcpyDeviceToHost`
- `hipMemcpyHostToDevice`
- `hipMemcpyToSymbol`
- `hipStreamDefault`
- `HIP_SYMBOL`

25
HIP-Basic/device_globals/device_globals_vs2019.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "device_globals_vs2019", "device_globals_vs2019.vcxproj", "{F7DD9451-B0CA-4C76-AB92-0E01CBEBDBBE}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F7DD9451-B0CA-4C76-AB92-0E01CBEBDBBE}.Debug|x64.ActiveCfg = Debug|x64
{F7DD9451-B0CA-4C76-AB92-0E01CBEBDBBE}.Debug|x64.Build.0 = Debug|x64
{F7DD9451-B0CA-4C76-AB92-0E01CBEBDBBE}.Release|x64.ActiveCfg = Release|x64
{F7DD9451-B0CA-4C76-AB92-0E01CBEBDBBE}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {2D151D72-1741-4B0B-99F9-50C182082CFC}
EndGlobalSection
EndGlobal

99
HIP-Basic/device_globals/device_globals_vs2019.vcxproj

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{f7dd9451-b0ca-4c76-ab92-0e01cbebdbbe}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>device_globals_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

27
HIP-Basic/device_globals/device_globals_vs2019.vcxproj.filters

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{b6be5f33-3a87-4cea-900e-720c76b2bdd7}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{116f6a96-2d11-4004-974f-2d651b18763d}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{f1508573-344a-468c-93ba-fa8fccbff0bf}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

164
HIP-Basic/device_globals/main.hip

@ -0,0 +1,164 @@ @@ -0,0 +1,164 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <hip/hip_runtime.h>
#include <cassert>
#include <iostream>
#include <numeric>
#include <vector>
constexpr unsigned int device_array_size = 16;
/// A test global variable of a single element, that will later be set from the host.
__device__ float global;
/// A test global variable of \p device_array_size elements that will be set from the host.
__device__ float global_array[device_array_size];
/// \brief Test kernel that combines its input with both device globals.
/// Every thread whose global index is below \p size reads one element of <tt>in</tt>,
/// adds <tt>global</tt> and the element of <tt>global_array</tt> selected by the index
/// modulo \p device_array_size, and stores the sum in <tt>out</tt>.
__global__ void test_globals_kernel(float* out, const float* in, const size_t size)
{
    const unsigned int thread_index = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads that fall outside the buffers have nothing to do.
    if(thread_index >= size)
    {
        return;
    }

    const float array_term = global_array[thread_index % device_array_size];
    out[thread_index]      = in[thread_index] + global + array_term;
}
/// \brief Computes a reference result on the host, that is (if everything goes well)
/// hopefully equal to the results from the \p test_globals_kernel kernel.
/// \param in           Input values; the result has the same number of elements.
/// \param global_array Host copy of the per-index addends. It is indexed modulo its own
///                     size, so it must not be empty when \p in is non-empty.
/// \param global       Scalar added to every element.
/// \return A vector where element <tt>i</tt> is
///         <tt>in[i] + global + global_array[i % global_array.size()]</tt>.
std::vector<float> test_globals_reference(const std::vector<float>& in,
                                          const std::vector<float>& global_array,
                                          const float               global)
{
    // `global_array` is taken by const reference: it is only read, so copying it
    // on every call would be wasted work.
    std::vector<float> out(in.size());
    for(size_t i = 0; i < in.size(); ++i)
    {
        out[i] = in[i] + global + global_array[i % global_array.size()];
    }
    return out;
}
/// \brief Entry point of the device globals example.
/// Initializes the device variables <tt>global</tt> and <tt>global_array</tt> from the host,
/// runs \p test_globals_kernel and validates its output against \p test_globals_reference.
/// \return 0 when the device results match the host reference, \p error_exit_code otherwise.
int main()
{
    // The size of the input and output vectors.
    constexpr unsigned int size = 64;

    // The total number of bytes in the input and output vectors.
    constexpr size_t size_bytes = size * sizeof(float);

    // Number of threads per kernel block.
    constexpr unsigned int block_size = size;

    // Number of blocks per kernel grid. The expression below calculates ceil(size/block_size).
    constexpr unsigned int grid_size = (size + block_size - 1) / block_size;

    // Allocate host vectors for the input and output.
    std::vector<float> h_in(size);
    std::vector<float> h_out(size);

    // Fill the input with an increasing sequence (i.e. 1, 2, 3, 4...).
    std::iota(h_in.begin(), h_in.end(), 1.f);

    // Allocate and copy vectors to device memory.
    float* d_in{};
    float* d_out{};
    HIP_CHECK(hipMalloc(&d_in, size_bytes));
    HIP_CHECK(hipMalloc(&d_out, size_bytes));
    HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_bytes, hipMemcpyHostToDevice));

    // Fetch a device pointer to the device variable "global". We can pass the relevant
    // symbol directly to this function.
    void*  d_global{};
    size_t global_size_bytes{};
    HIP_CHECK(hipGetSymbolAddress(&d_global, HIP_SYMBOL(global)));
    HIP_CHECK(hipGetSymbolSize(&global_size_bytes, HIP_SYMBOL(global)));
    assert(global_size_bytes == sizeof(float));

    // This pointer is a regular device pointer, and so we may use it in the same ways
    // as pointers allocated using `hipMalloc`.
    constexpr float h_global = 42.f;
    HIP_CHECK(hipMemcpy(d_global, &h_global, global_size_bytes, hipMemcpyHostToDevice));

    // Set up the inputs for `global_array`.
    std::vector<float> h_global_array(device_array_size);
    for(size_t i = 0; i < h_global_array.size(); ++i)
    {
        h_global_array[i] = i * 1000.f;
    }

    // Initialize `global_array` by copying to it directly, omitting the need to fetch it first.
    HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(global_array),
                                h_global_array.data(),
                                h_global_array.size() * sizeof(float)));

    // Launch the kernel on the default stream and with the above configuration.
    // hipLaunchKernelGGL expects the grid dimensions first, followed by the block dimensions.
    hipLaunchKernelGGL(test_globals_kernel,
                       dim3(grid_size),
                       dim3(block_size),
                       0,
                       hipStreamDefault,
                       d_out,
                       d_in,
                       size);

    // Check if the kernel launch was successful.
    HIP_CHECK(hipGetLastError());

    // Copy the results back to the host. This call blocks the host's execution until the copy is finished.
    HIP_CHECK(hipMemcpy(h_out.data(), d_out, size_bytes, hipMemcpyDeviceToHost));

    // Free device memory.
    HIP_CHECK(hipFree(d_in));
    HIP_CHECK(hipFree(d_out));

    // Compute the expected values on the host.
    const std::vector<float> reference = test_globals_reference(h_in, h_global_array, h_global);

    // Check the results' validity.
    constexpr float eps = 1.0E-6;
    unsigned int    errors{};
    for(size_t i = 0; i < size; ++i)
    {
        if(std::fabs(h_out[i] - reference[i]) > eps)
        {
            ++errors;
        }
    }

    if(errors != 0)
    {
        std::cout << "Validation failed. Errors: " << errors << std::endl;
        return error_exit_code;
    }
    else
    {
        std::cout << "Validation passed." << std::endl;
    }

    return 0;
}

28
HIP-Basic/device_query/Makefile

@ -31,25 +31,31 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -31,25 +31,31 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -isystem $(HIP_INCLUDE_DIR) -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -isystem $(HIP_INCLUDE_DIR) -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
ICXXFLAGS += -x cu
else ifeq ($(GPU_RUNTIME), HIP)
HIPCXX = $(CXX)
CXXFLAGS += -D__HIP_PLATFORM_AMD__
LDFLAGS += -L $(ROCM_INSTALL_DIR)/lib
LDLIBS += -lamdhip64
CXXFLAGS ?= -Wall -Wextra
HIPCXX := $(CXX)
ICXXFLAGS += -D__HIP_PLATFORM_AMD__
ILDFLAGS += -L $(ROCM_INSTALL_DIR)/lib
ILDLIBS += -lamdhip64
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
$(EXAMPLE): main.cpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE)

2
HIP-Basic/device_query/device_query_vs2019.vcxproj

@ -66,7 +66,7 @@ @@ -66,7 +66,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level1</WarningLevel>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

22
HIP-Basic/dynamic_shared/Makefile

@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE)

2
HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj

@ -66,7 +66,7 @@ @@ -66,7 +66,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level1</WarningLevel>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>

22
HIP-Basic/events/Makefile

@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE)

4
HIP-Basic/events/events_vs2019.vcxproj

@ -66,7 +66,7 @@ @@ -66,7 +66,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level1</WarningLevel>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
@ -96,4 +96,4 @@ @@ -96,4 +96,4 @@
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>
</Project>

1
HIP-Basic/gpu_arch/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
hip_gpu_arch

57
HIP-Basic/gpu_arch/CMakeLists.txt

@ -0,0 +1,57 @@ @@ -0,0 +1,57 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Build script for the hip_gpu_arch example.

# cmake_minimum_required() comes first so that policy defaults are established
# before any other command runs.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

set(example_name hip_gpu_arch)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME selects the language main.hip is compiled as (HIP or CUDA).
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
    message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

# Default the package search prefix to the ROCm installation unless the user
# already provided one.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)

# Make example runnable using ctest. The NAME/COMMAND signature lets CMake
# replace the target name with the location of the built executable.
# Note: tests are only generated when testing has been enabled
# (enable_testing() / include(CTest)) for this directory.
add_test(NAME ${example_name} COMMAND ${example_name})

# Headers shared by all examples live in the Common directory.
set(include_dirs "${CMAKE_CURRENT_SOURCE_DIR}/../../Common")
if("${GPU_RUNTIME}" STREQUAL "CUDA")
    # When compiling as CUDA the ROCm include directory is added explicitly.
    list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()

target_include_directories(${example_name} PRIVATE ${include_dirs})
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

60
HIP-Basic/gpu_arch/Makefile

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Name of the executable produced by this Makefile.
EXAMPLE := hip_gpu_arch
# Directory holding the headers shared between examples (example_utils.hpp).
COMMON_INCLUDE_DIR := ../../Common
# Selects the build flavour; must be either HIP or CUDA (checked below).
GPU_RUNTIME := HIP
# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
# The compiler driver defaults to hipcc unless HIPCXX is already set.
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
# The I-prefixed variables hold the flags required by this Makefile; the
# user-facing CXXFLAGS/CPPFLAGS/LDFLAGS/LDLIBS are appended to them further down.
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
# Runtime-specific flags. For CUDA the source is compiled as a .cu file and the
# HIP headers are added as system includes. For HIP, -Wall -Wextra is used as
# the default for CXXFLAGS only when CXXFLAGS has not been set already.
ifeq ($(GPU_RUNTIME), CUDA)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
# Append the user-provided flags after the required ones.
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
# Build the example from main.hip; it is rebuilt when the shared utility header changes.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
# Remove the built executable.
clean:
$(RM) $(EXAMPLE)
.PHONY: clean

33
HIP-Basic/gpu_arch/README.md

@ -0,0 +1,33 @@ @@ -0,0 +1,33 @@
# HIP-Basic GPU Architecture-specific Code Example
## Description
This program showcases an implementation of a simple matrix transpose kernel, which uses a different codepath depending on the target architecture.
### Application flow
1. A number of constants are defined to control the problem details and the kernel launch parameters.
2. The input matrix is set up in host memory.
3. The necessary amount of device memory is allocated and input is copied to the device.
4. The GPU transposition kernel is launched with previously defined arguments.
5. The kernel will have two different codepaths for its data movement, depending on the target architecture.
6. The transposed matrix is copied back to the host and all device memory is freed.
7. The elements of the result matrix are compared with the expected result. The result of the comparison is printed to the standard output.
## Key APIs and Concepts
This example showcases two different codepaths inside a GPU kernel, depending on the target architecture.
You may want to use architecture-specific inline assembly when compiling for a specific architecture, without losing compatibility with other architectures (see the [inline_assembly](/HIP-Basic/inline_assembly/main.hip) example).
These architecture-specific compiler definitions only exist within GPU kernels. If you would like to have GPU architecture-specific host-side code, you could query the stream/device information at runtime.
## Demonstrated API Calls
### HIP runtime
#### Device symbols
- `threadIdx`, `blockIdx`, `blockDim`
- `__gfx1010__`, `__gfx1011__`, `__gfx1012__`, `__gfx1030__`, `__gfx1031__`, `__gfx1100__`, `__gfx1101__`, `__gfx1102__`
#### Host symbols
- `hipMalloc`
- `hipMemcpy`
- `hipLaunchKernelGGL`
- `HIP_KERNEL_NAME`
- `hipGetLastError`
- `hipFree`

25
HIP-Basic/gpu_arch/gpu_arch_vs2019.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC941}") = "gpu_arch_vs2019", "gpu_arch_vs2019.vcxproj", "{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A8}.Debug|x64.ActiveCfg = Debug|x64
{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A8}.Debug|x64.Build.0 = Debug|x64
{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A8}.Release|x64.ActiveCfg = Release|x64
{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A8}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {50A6F0A7-FE4A-4B74-BE6E-1A354D8AD065}
EndGlobalSection
EndGlobal

99
HIP-Basic/gpu_arch/gpu_arch_vs2019.vcxproj

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{4e6b2034-d7ed-4cb4-98b2-7b2d2b71e0a8}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>gpu_arch_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

27
HIP-Basic/gpu_arch/gpu_arch_vs2019.vcxproj.filters

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{91fb42b0-13d7-42c2-9f9f-edead539556a}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{dc73d4e0-b3d9-4216-9237-72e4a97ea387}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{ad5f5a22-1e00-4ee8-89fa-ec5047963ec0}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

150
HIP-Basic/gpu_arch/main.hip

@ -0,0 +1,150 @@ @@ -0,0 +1,150 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
#include <cstddef>
#include <cstdlib>
/// \brief A simple matrix transpose kernel that selects its data-movement codepath at
/// compile time, depending on the GPU architecture it is being compiled for.
/// - The input and output are square matrices; the number of rows and columns of both is
///   given by the \p width parameter.
/// - Each thread in the grid is responsible for one element of the input and output matrices.
/// - No bounds check is performed, so the launch grid is expected to cover exactly
///   \p width threads in both the X and Y dimensions.
__global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width)
{
// Global X and Y coordinates of this thread within the grid.
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
#if(__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__ || __gfx1100__ \
|| __gfx1101__ || __gfx1102__)
// Codepath for one of the architectures listed above
// Reads in[x][y] and writes it to out[y][x].
out[y * width + x] = in[x * width + y];
#else
// Codepath if we're not on one of those architectures
// Note: to check if we're on AMD or NVIDIA hardware, you could use the more generic:
// __HIP_PLATFORM_AMD__ and __HIP_PLATFORM_NVIDIA__
// Reads in[y][x] and writes it to out[x][y]; the overall result is the same transpose.
out[x * width + y] = in[y * width + x];
#endif
}
// Host-side reference: transposes a square, row-major matrix.
// `input` holds width * width elements; element (row, col) of the input ends up
// at position (col, row) of the returned matrix.
std::vector<float> matrix_transpose_reference(const std::vector<float>& input,
                                              const unsigned int        width)
{
    std::vector<float> transposed(width * width);

    for(unsigned int row = 0; row < width; ++row)
    {
        // Offset of the first element of this input row.
        const unsigned int row_offset = row * width;

        for(unsigned int col = 0; col < width; ++col)
        {
            transposed[col * width + row] = input[row_offset + col];
        }
    }

    return transposed;
}
// Entry point of the gpu_arch example: transposes a square matrix on the device with
// matrix_transpose_kernel and validates the result against matrix_transpose_reference.
// Returns error_exit_code when validation fails.
int main()
{
// Number of rows and columns in the transposed square matrix.
constexpr unsigned int width = 1024;
// Number of threads in each kernel block along the X dimension.
constexpr unsigned int threads_per_block_x = 8;
// Number of threads in each kernel block along the Y dimension.
constexpr unsigned int threads_per_block_y = 8;
// Total element count of the transposed matrix.
constexpr unsigned int size = width * width;
// Total size (in bytes) of the transposed matrix.
constexpr size_t size_bytes = sizeof(float) * size;
// Allocate host vectors.
std::vector<float> h_matrix(size);
std::vector<float> h_transposed_matrix(size);
// Set up input data.
for(unsigned int i = 0; i < size; i++)
{
h_matrix[i] = i * 10.0f;
}
// Allocate device memory for the input and output matrices.
float* d_matrix{};
float* d_transposed_matrix{};
HIP_CHECK(hipMalloc(&d_matrix, size_bytes));
HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes));
// Transfer the input matrix to the device memory.
HIP_CHECK(hipMemcpy(d_matrix, h_matrix.data(), size_bytes, hipMemcpyHostToDevice));
// Launching kernel from host.
// The grid dimensions use integer division, so width must be a multiple of the
// block dimensions for every matrix element to be covered (1024 / 8 is exact here).
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_transpose_kernel),
dim3(width / threads_per_block_x, width / threads_per_block_y),
dim3(threads_per_block_x, threads_per_block_y),
0,
hipStreamDefault,
d_transposed_matrix,
d_matrix,
width);
// Check if the kernel launch was successful.
HIP_CHECK(hipGetLastError());
// Transfer the result back to the host.
HIP_CHECK(hipMemcpy(h_transposed_matrix.data(),
d_transposed_matrix,
size_bytes,
hipMemcpyDeviceToHost));
// Free the resources on the device.
HIP_CHECK(hipFree(d_matrix));
HIP_CHECK(hipFree(d_transposed_matrix));
// Perform the reference (CPU) calculation.
std::vector<float> ref_transposed_matrix = matrix_transpose_reference(h_matrix, width);
// Check the results' validity.
// Count every element whose absolute difference from the reference exceeds eps.
constexpr float eps = 1.0E-6;
unsigned int errors{};
for(unsigned int i = 0; i < size; i++)
{
if(std::fabs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps)
{
errors++;
}
}
if(errors != 0)
{
std::cout << "Validation failed. Errors: " << errors << std::endl;
return error_exit_code;
}
else
{
std::cout << "Validation passed." << std::endl;
}
}

22
HIP-Basic/hello_world/Makefile

@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE)

24
HIP-Basic/hipify/Makefile

@ -30,27 +30,33 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -30,27 +30,33 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS :=
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS :=
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
all: $(EXAMPLE)
# Step
main.hip: main.cu
$(ROCM_INSTALL_DIR)/bin/hipify-perl $< -o $@
$(EXAMPLE): main.hip
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE) main.hip *.o

1
HIP-Basic/inline_assembly/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
hip_inline_assembly

57
HIP-Basic/inline_assembly/CMakeLists.txt

@ -0,0 +1,57 @@ @@ -0,0 +1,57 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Name shared by the project, the executable target and the registered test.
set(example_name hip_inline_assembly)

cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME selects the language main.hip is compiled as; only HIP and CUDA
# are accepted.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
    message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

# ROCM_ROOT is only used as the package search prefix when the user has not
# provided CMAKE_PREFIX_PATH, and as the HIP header location for CUDA builds.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)

# Make example runnable using ctest.
# The NAME/COMMAND signature lets CMake substitute the built location of the
# executable target; the legacy positional signature does not support target names.
add_test(NAME ${example_name} COMMAND ${example_name})

# Anchor the shared header directory to this file's location.
set(include_dirs "${CMAKE_CURRENT_SOURCE_DIR}/../../Common")
if("${GPU_RUNTIME}" STREQUAL "CUDA")
    # The CUDA build still includes the HIP headers, so add them explicitly.
    list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()

target_include_directories(${example_name} PRIVATE ${include_dirs})

# main.hip has no extension CMake maps to CUDA, so state its language explicitly.
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

60
HIP-Basic/inline_assembly/Makefile

@ -0,0 +1,60 @@ @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
EXAMPLE := hip_inline_assembly
COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME := HIP

# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include

HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc

# Common variables and flags
# The I-prefixed variables collect the flags this Makefile itself requires;
# the conventional CXXFLAGS/CPPFLAGS/LDFLAGS/LDLIBS are appended to them below.
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=

ifeq ($(GPU_RUNTIME), CUDA)
    # Compile main.hip as CUDA source and make the HIP headers visible.
    ICXXFLAGS += -x cu
    ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
    # Default warning flags, used only when CXXFLAGS is not already set.
    CXXFLAGS ?= -Wall -Wextra
else
    $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif

ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)

# Default goal; provided so that "make all" works here as it does in the
# other example Makefiles.
all: $(EXAMPLE)

# Single-step compile and link; rebuilt when the source or the shared utility
# header changes. Only the first prerequisite ($<) is passed to the compiler.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
	$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)

clean:
	$(RM) $(EXAMPLE)

.PHONY: all clean

48
HIP-Basic/inline_assembly/README.md

@ -0,0 +1,48 @@ @@ -0,0 +1,48 @@
# HIP-Basic Inline Assembly Example
## Description
This program showcases an implementation of a simple matrix transpose kernel, which uses inline assembly and works on both AMD and NVIDIA hardware.
By using inline assembly in your kernels, you may be able to gain extra performance.
It could also enable you to use special GPU hardware features which are not available through compiler intrinsics.
For more insights, please read the following blogs by Ben Sander:
[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](https://gpuopen.com/learn/amdgcn-assembly/) &
[AMD GCN Assembly: Cross-Lane Operations](https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations/)
For more information:
[AMD ISA documentation for current architectures](https://gpuopen.com/amd-isa-documentation/) &
[User Guide for LLVM AMDGPU Back-end](https://llvm.org/docs/AMDGPUUsage.html)
### Application flow
1. A number of variables are defined to control the problem details and the kernel launch parameters.
2. Input matrix is set up in host memory.
3. The necessary amount of device memory is allocated and input is copied to the device.
4. The GPU transposition kernel is launched with previously defined arguments.
5. The kernel will use different inline assembly for its data movement, depending on the target platform.
6. The transposed matrix is copied back to the host and all device memory is freed.
7. The elements of the result matrix are compared with the expected result. The result of the comparison is printed to the standard output.
## Key APIs and Concepts
Using inline assembly in GPU kernels is somewhat similar to using inline assembly in host-side code. The `volatile` qualifier tells the compiler not to remove the assembly statement during optimizations.
```c++
asm volatile("v_mov_b32_e32 %0, %1" : "=v"(variable_0) : "v"(variable_1));
```
However, since the instruction set differs between GPU architectures, you usually want to use the appropriate GPU architecture compiler defines to support multiple architectures (see the [gpu_arch](/HIP-Basic/gpu_arch/main.hip) example for more fine-grained architecture control).
## Demonstrated API Calls
### HIP runtime
#### Device symbols
- `threadIdx`, `blockIdx`, `blockDim`
- `__HIP_PLATFORM_AMD__`, `__HIP_PLATFORM_NVIDIA__`
#### Host symbols
- `hipMalloc`
- `hipMemcpy`
- `hipLaunchKernelGGL`
- `HIP_KERNEL_NAME`
- `hipGetLastError`
- `hipFree`

25
HIP-Basic/inline_assembly/inline_assembly_vs2019.sln

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "inline_assembly_vs2019", "inline_assembly_vs2019.vcxproj", "{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A7}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|x64 = Debug|x64
		Release|x64 = Release|x64
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A7}.Debug|x64.ActiveCfg = Debug|x64
		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A7}.Debug|x64.Build.0 = Debug|x64
		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A7}.Release|x64.ActiveCfg = Release|x64
		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A7}.Release|x64.Build.0 = Release|x64
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {50A6F0A7-FE4A-4B74-BE6E-1A354D8AD064}
	EndGlobalSection
EndGlobal

99
HIP-Basic/inline_assembly/inline_assembly_vs2019.vcxproj

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- MSBuild project for the inline_assembly example: declares Debug and Release x64 configurations that build main.hip with the HIP platform toolset. -->
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Configurations offered by this project. -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<!-- The single source file compiled by the project. -->
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<!-- Header from the repository's Common directory, listed as a project item. -->
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<!-- ProjectGuid is the identifier the accompanying solution file uses to refer to this project. -->
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{4e6b2034-d7ed-4cb4-98b2-7b2d2b71e0a7}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>inline_assembly_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<!-- Both configurations build an application with the HIP toolset; Release additionally enables whole program optimization. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<!-- Import the HIP property sheet unless HIPPropertiesImported is already 'true'. -->
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<!-- Optional per-user property sheets, imported only when the file exists. -->
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<!-- The output binary is named "hip_" followed by the project name in both configurations. -->
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<!-- GPU architecture setting for the HIP toolset; both configurations list gfx1030 only. -->
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<!-- Debug compile and link settings: C++17, the Common directory on the include path, console subsystem, debug information. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<!-- Release compile and link settings: same standard and include path as Debug, with NDEBUG defined and function-level linking, intrinsics, COMDAT folding and reference optimization enabled. -->
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<!-- Import the HIP targets unless HIPTargetsImported is already 'true'. -->
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

27
HIP-Basic/inline_assembly/inline_assembly_vs2019.vcxproj.filters

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Filter definitions for the inline_assembly project: declares three filters and assigns main.hip and example_utils.hpp to them. -->
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Filter declarations, each with its own identifier and associated file extensions. -->
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{91fb42b0-13d7-42c2-9f9f-edead539556a}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{dc73d4e0-b3d9-4216-9237-72e4a97ea387}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{ad5f5a22-1e00-4ee8-89fa-ec5047963ec0}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<!-- main.hip is placed under "Source Files". -->
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<!-- The header from the Common directory is placed under "Header Files". -->
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

145
HIP-Basic/inline_assembly/main.hip

@ -0,0 +1,145 @@ @@ -0,0 +1,145 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "example_utils.hpp"

#include <hip/hip_runtime.h>

#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
/// \brief A simple matrix transpose kernel that's using inline assembly.
/// - The input and output are square matrices with \p width rows and \p width columns,
///   indexed as [row * width + column].
/// - Each thread in the grid is responsible for one element of the input and output matrices.
/// - Threads whose coordinates fall outside the matrix return without touching memory.
/// \param out Device pointer that receives the transposed matrix.
/// \param in Device pointer to the matrix to transpose.
/// \param width Number of rows (and columns) of both matrices.
__global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width)
{
    // Unsigned to match the built-in index types and \p width.
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard against a launch grid that extends past the matrix.
    if(x >= width || y >= width)
    {
        return;
    }

    // A single register-to-register move, spelled in the instruction set of the target platform.
#ifdef __HIP_PLATFORM_AMD__
    asm volatile("v_mov_b32_e32 %0, %1" : "=v"(out[x * width + y]) : "v"(in[y * width + x]));
#elif defined(__HIP_PLATFORM_NVIDIA__)
    asm volatile("mov.f32 %0, %1;" : "=f"(out[x * width + y]) : "f"(in[y * width + x]));
#endif
}
/// \brief Host-side reference transpose of a square matrix.
/// \param input Matrix elements, where element (row, col) is read from input[row * width + col].
/// \param width Number of rows (and columns) of the matrix.
/// \returns A new vector in which element (col, row) holds the input's element (row, col).
std::vector<float> matrix_transpose_reference(const std::vector<float>& input,
                                              const unsigned int        width)
{
    std::vector<float> transposed(width * width);

    for(unsigned int row = 0; row != width; ++row)
    {
        // Offset of the first element of the current input row.
        const unsigned int row_offset = row * width;

        for(unsigned int col = 0; col != width; ++col)
        {
            transposed[col * width + row] = input[row_offset + col];
        }
    }

    return transposed;
}
/// \brief Transposes a square matrix on the device with \p matrix_transpose_kernel and
/// validates the result against the host reference implementation.
/// \returns \p error_exit_code when any element differs from the reference; otherwise
/// falls off the end of main, which yields 0.
int main()
{
    // Number of rows and columns in the transposed square matrix.
    constexpr unsigned int width = 1024;

    // Number of threads in each kernel block along the X dimension.
    constexpr unsigned int threads_per_block_x = 8;

    // Number of threads in each kernel block along the Y dimension.
    constexpr unsigned int threads_per_block_y = 8;

    // The launch grid below is sized with truncating integer division, so the block
    // dimensions must divide the width evenly for every element to be covered.
    static_assert(width % threads_per_block_x == 0 && width % threads_per_block_y == 0,
                  "width must be a multiple of both block dimensions");

    // Total element count of the transposed matrix.
    constexpr unsigned int size = width * width;

    // Total size (in bytes) of the transposed matrix.
    constexpr size_t size_bytes = sizeof(float) * size;

    // Allocate host vectors.
    std::vector<float> h_matrix(size);
    std::vector<float> h_transposed_matrix(size);

    // Set up input data.
    for(unsigned int i = 0; i < size; i++)
    {
        h_matrix[i] = i * 10.0f;
    }

    // Allocate device memory for the input and output matrices.
    float* d_matrix{};
    float* d_transposed_matrix{};
    HIP_CHECK(hipMalloc(&d_matrix, size_bytes));
    HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes));

    // Transfer the input matrix to the device memory.
    HIP_CHECK(hipMemcpy(d_matrix, h_matrix.data(), size_bytes, hipMemcpyHostToDevice));

    // Launching kernel from host.
    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_transpose_kernel),
                       dim3(width / threads_per_block_x, width / threads_per_block_y),
                       dim3(threads_per_block_x, threads_per_block_y),
                       0,
                       hipStreamDefault,
                       d_transposed_matrix,
                       d_matrix,
                       width);

    // Check if the kernel launch was successful.
    HIP_CHECK(hipGetLastError());

    // Transfer the result back to the host.
    HIP_CHECK(hipMemcpy(h_transposed_matrix.data(),
                        d_transposed_matrix,
                        size_bytes,
                        hipMemcpyDeviceToHost));

    // Free the resources on the device.
    HIP_CHECK(hipFree(d_matrix));
    HIP_CHECK(hipFree(d_transposed_matrix));

    // Perform the reference (CPU) calculation.
    std::vector<float> ref_transposed_matrix = matrix_transpose_reference(h_matrix, width);

    // Check the results' validity.
    constexpr float eps = 1.0E-6;
    unsigned int    errors{};
    for(unsigned int i = 0; i < size; i++)
    {
        if(std::fabs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps)
        {
            errors++;
        }
    }

    if(errors != 0)
    {
        std::cout << "Validation failed. Errors: " << errors << std::endl;
        return error_exit_code;
    }
    else
    {
        std::cout << "Validation passed." << std::endl;
    }
}

17
HIP-Basic/llvm_ir_to_executable/Makefile

@ -23,10 +23,9 @@ COMMON_INCLUDE_DIR := ../../Common @@ -23,10 +23,9 @@ COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME ?= HIP
ifneq ($(GPU_RUNTIME), HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
endif
# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
@ -37,11 +36,11 @@ LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc @@ -37,11 +36,11 @@ LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc
CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD) $(CXXFLAGS)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR) $(CPPFLAGS)
ILDFLAGS := $(LDFLAGS)
ILDLIBS := $(LDLIBS)
# Compile for these GPU architectures
HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030
@ -60,7 +59,7 @@ GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amd @@ -60,7 +59,7 @@ GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amd
all: $(EXAMPLE)
$(EXAMPLE): main.o main_device.o
$(HIPCXX) -o $@ $^
$(HIPCXX) $(ILDFLAGS) -o $@ $^ $(ILDLIBS)
main_device.o: hip_obj_gen.mcin offload_bundle.hipfb
$(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj
@ -73,7 +72,7 @@ offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o) @@ -73,7 +72,7 @@ offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o)
-output=$@
main.o: main.hip
$(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only $<
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) -c --cuda-host-only $<
main_%.o: main_%.ll
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $<

50
HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln

@ -1,25 +1,25 @@ @@ -1,25 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llvm_ir_to_executable_vs2019", "llvm_ir_to_executable_vs2019.vcxproj", "{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.ActiveCfg = Debug|x64
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.Build.0 = Debug|x64
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.ActiveCfg = Release|x64
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {0A13532C-E06B-4427-9847-54070C1E8622}
EndGlobalSection
EndGlobal

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.32630.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llvm_ir_to_executable_vs2019", "llvm_ir_to_executable_vs2019.vcxproj", "{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.ActiveCfg = Debug|x64
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.Build.0 = Debug|x64
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.ActiveCfg = Release|x64
{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {0A13532C-E06B-4427-9847-54070C1E8622}
EndGlobalSection
EndGlobal

366
HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj

@ -1,183 +1,183 @@ @@ -1,183 +1,183 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="hip_obj_gen_win.mcin">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs>
</CustomBuild>
<CustomBuild Include="main_gfx1030.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
</CustomBuild>
<CustomBuild Include="main_gfx803.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
</CustomBuild>
<CustomBuild Include="main_gfx900.ll">
  <!-- Compiles main_gfx900.ll to $(IntDir)main_gfx900.o for the gfx900 target, with the same command in both configurations. -->
  <FileType>Document</FileType>
  <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
  <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
</CustomBuild>
<CustomBuild Include="main_gfx906.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
</CustomBuild>
<CustomBuild Include="main_gfx908.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
</CustomBuild>
<CustomBuild Include="main_gfx90a.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
</CustomBuild>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{dbb8dfe9-cb1b-473c-937c-2a8120e0d819}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>llvm_ir_to_executable_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level1</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<CustomBuild>
<Message>Compiling Device LLVM IR %(Identity)</Message>
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
<Outputs>$(IntDir)%(FileName).o</Outputs>
</CustomBuild>
<CustomBuildStep>
<Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
</CustomBuildStep>
<CustomBuildStep>
<Message>Generating Device Offload Object</Message>
</CustomBuildStep>
<CustomBuildStep>
<Outputs>$(IntDIr)main_device.obj</Outputs>
</CustomBuildStep>
<CustomBuildStep>
  <!-- Files the offload-object step depends on: the per-architecture device objects and the copy of hip_obj_gen_win.mcin placed in $(IntDir). -->
  <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
  <!-- Host-side compile settings for the Release|x64 configuration. -->
  <ClCompile>
    <WarningLevel>Level2</WarningLevel>
    <FunctionLevelLinking>true</FunctionLevelLinking>
    <IntrinsicFunctions>true</IntrinsicFunctions>
    <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    <LanguageStandard>stdcpp17</LanguageStandard>
    <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
  </ClCompile>
  <Link>
    <SubSystem>Console</SubSystem>
    <EnableCOMDATFolding>true</EnableCOMDATFolding>
    <OptimizeReferences>true</OptimizeReferences>
    <GenerateDebugInformation>true</GenerateDebugInformation>
    <!-- main_device.obj is the object emitted by the CustomBuildStep below. -->
    <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
  </Link>
  <!-- Default metadata for CustomBuild items: compile %(Identity) for the
       amdgcn-amd-amdhsa target into $(IntDir)%(FileName).o. -->
  <CustomBuild>
    <Message>Compiling Device LLVM IR %(Identity)</Message>
    <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
    <Outputs>$(IntDir)%(FileName).o</Outputs>
  </CustomBuild>
  <!-- Project-level step: bundle the six per-architecture objects into
       offload_bundle.hipfb, then assemble hip_obj_gen_win.mcin into main_device.obj.
       Every device target uses the same spelling, with two consecutive hyphens
       before the gfx architecture name.
       NOTE(review): hip_obj_gen_win.mcin presumably embeds offload_bundle.hipfb; confirm in that file. -->
  <CustomBuildStep>
    <Message>Generating Device Offload Object</Message>
    <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
    <Outputs>$(IntDir)main_device.obj</Outputs>
    <!-- The assembler input is listed under the same name the llvm-mc command reads. -->
    <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
  </CustomBuildStep>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
  <!-- main.hip is compiled with the cuda-host-only option in both configurations. -->
  <ClCompile Include="main.hip">
    <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions>
    <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions>
  </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="hip_obj_gen_win.mcin">
<!-- Copies this file into $(IntDir) under the same name, for both Debug|x64 and Release|x64.
     These per-item Command/Message/Outputs values replace the CustomBuild defaults for this item. -->
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs>
</CustomBuild>
<CustomBuild Include="main_gfx1030.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
</CustomBuild>
<CustomBuild Include="main_gfx803.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command>
</CustomBuild>
<CustomBuild Include="main_gfx900.ll">
  <!-- Compiles the gfx900 LLVM IR into $(IntDir)%(FileName).o. The compiler path is
       quoted as a whole, the same way as for the other architectures. -->
  <FileType>Document</FileType>
  <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
  <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
</CustomBuild>
<CustomBuild Include="main_gfx906.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
</CustomBuild>
<CustomBuild Include="main_gfx908.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
</CustomBuild>
<CustomBuild Include="main_gfx90a.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
</CustomBuild>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{dbb8dfe9-cb1b-473c-937c-2a8120e0d819}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>llvm_ir_to_executable_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
  <!-- Host-side compile settings for the Debug|x64 configuration. -->
  <ClCompile>
    <WarningLevel>Level2</WarningLevel>
    <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    <LanguageStandard>stdcpp17</LanguageStandard>
    <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
  </ClCompile>
  <Link>
    <SubSystem>Console</SubSystem>
    <GenerateDebugInformation>true</GenerateDebugInformation>
    <!-- main_device.obj is the object emitted by the CustomBuildStep below. -->
    <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
  </Link>
  <!-- Default metadata for CustomBuild items: compile %(Identity) for the
       amdgcn-amd-amdhsa target into $(IntDir)%(FileName).o. -->
  <CustomBuild>
    <Message>Compiling Device LLVM IR %(Identity)</Message>
    <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
    <Outputs>$(IntDir)%(FileName).o</Outputs>
  </CustomBuild>
  <!-- Project-level step: bundle the six per-architecture objects into
       offload_bundle.hipfb, then assemble hip_obj_gen_win.mcin into main_device.obj.
       NOTE(review): hip_obj_gen_win.mcin presumably embeds offload_bundle.hipfb; confirm in that file. -->
  <CustomBuildStep>
    <Message>Generating Device Offload Object</Message>
    <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
    <Outputs>$(IntDir)main_device.obj</Outputs>
    <!-- The assembler input is listed under the same name the llvm-mc command reads. -->
    <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
  </CustomBuildStep>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
  <!-- Host-side compile settings for the Release|x64 configuration. -->
  <ClCompile>
    <WarningLevel>Level2</WarningLevel>
    <FunctionLevelLinking>true</FunctionLevelLinking>
    <IntrinsicFunctions>true</IntrinsicFunctions>
    <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    <LanguageStandard>stdcpp17</LanguageStandard>
    <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
  </ClCompile>
  <Link>
    <SubSystem>Console</SubSystem>
    <EnableCOMDATFolding>true</EnableCOMDATFolding>
    <OptimizeReferences>true</OptimizeReferences>
    <GenerateDebugInformation>true</GenerateDebugInformation>
    <!-- main_device.obj is the object emitted by the CustomBuildStep below. -->
    <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
  </Link>
  <!-- Default metadata for CustomBuild items: compile %(Identity) for the
       amdgcn-amd-amdhsa target into $(IntDir)%(FileName).o. -->
  <CustomBuild>
    <Message>Compiling Device LLVM IR %(Identity)</Message>
    <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
    <Outputs>$(IntDir)%(FileName).o</Outputs>
  </CustomBuild>
  <!-- Project-level step: bundle the six per-architecture objects into
       offload_bundle.hipfb, then assemble hip_obj_gen_win.mcin into main_device.obj.
       Every device target uses the same spelling, with two consecutive hyphens
       before the gfx architecture name.
       NOTE(review): hip_obj_gen_win.mcin presumably embeds offload_bundle.hipfb; confirm in that file. -->
  <CustomBuildStep>
    <Message>Generating Device Offload Object</Message>
    <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
    <Outputs>$(IntDir)main_device.obj</Outputs>
    <!-- The assembler input is listed under the same name the llvm-mc command reads. -->
    <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
  </CustomBuildStep>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

106
HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters

@ -1,53 +1,53 @@ @@ -1,53 +1,53 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4f2a1544-a556-4afb-b630-36ba54c0ab4a}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{b93521e0-9944-411a-9f6e-4071af6bc7ea}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{972f07c3-b925-4516-bd65-2d5a3f626888}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
  <!-- Assigns each CustomBuild item to the "Source Files" filter, one entry per item. -->
  <CustomBuild Include="main_gfx803.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx900.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx906.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx908.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx90a.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx1030.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="hip_obj_gen_win.mcin">
    <Filter>Source Files</Filter>
  </CustomBuild>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4f2a1544-a556-4afb-b630-36ba54c0ab4a}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{b93521e0-9944-411a-9f6e-4071af6bc7ea}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{972f07c3-b925-4516-bd65-2d5a3f626888}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
  <!-- Assigns each CustomBuild item to the "Source Files" filter, one entry per item. -->
  <CustomBuild Include="main_gfx803.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx900.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx906.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx908.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx90a.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="main_gfx1030.ll">
    <Filter>Source Files</Filter>
  </CustomBuild>
  <CustomBuild Include="hip_obj_gen_win.mcin">
    <Filter>Source Files</Filter>
  </CustomBuild>
</ItemGroup>
</Project>

2
HIP-Basic/llvm_ir_to_executable/main.hip

@ -31,7 +31,7 @@ @@ -31,7 +31,7 @@
/// \brief Device function to square each element
/// in the array `in` and write to array `out`.
template<typename T>
__global__ void vector_square_kernel(T* out, const T* in, const long long size)
__global__ void vector_square_kernel(T* out, const T* in, const unsigned long long size)
{
// Get the unique global thread ID
const size_t offset = blockIdx.x * blockDim.x + threadIdx.x;

16
HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll

@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa" @@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
$_Z20vector_square_kernelIfEvPT_PKS0_y = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any @@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
; Function Attrs: mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 {
; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_y(float addrspace(1)* nocapture writeonly %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 comdat {
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float @@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float
}
; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind }
@ -76,7 +78,7 @@ attributes #2 = { nounwind } @@ -76,7 +78,7 @@ attributes #2 = { nounwind }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 1}
!2 = !{i32 2, i32 0}
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
!3 = !{!"AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"}
!4 = !{!5, !9, i64 12}
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
!6 = !{!"short", !7, i64 0}

16
HIP-Basic/llvm_ir_to_executable/main_gfx803.ll

@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa" @@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
$_Z20vector_square_kernelIfEvPT_PKS0_y = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any @@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
; Function Attrs: mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 {
; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_y(float addrspace(1)* nocapture writeonly %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 comdat {
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float @@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float
}
; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx803" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx803" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind }
@ -76,7 +78,7 @@ attributes #2 = { nounwind } @@ -76,7 +78,7 @@ attributes #2 = { nounwind }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 1}
!2 = !{i32 2, i32 0}
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
!3 = !{!"AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"}
!4 = !{!5, !9, i64 12}
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
!6 = !{!"short", !7, i64 0}

16
HIP-Basic/llvm_ir_to_executable/main_gfx900.ll

@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa" @@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
$_Z20vector_square_kernelIfEvPT_PKS0_y = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any @@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
; Function Attrs: mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 {
; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_y(float addrspace(1)* nocapture writeonly %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 comdat {
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float @@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float
}
; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind }
@ -76,7 +78,7 @@ attributes #2 = { nounwind } @@ -76,7 +78,7 @@ attributes #2 = { nounwind }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 1}
!2 = !{i32 2, i32 0}
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
!3 = !{!"AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"}
!4 = !{!5, !9, i64 12}
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
!6 = !{!"short", !7, i64 0}

16
HIP-Basic/llvm_ir_to_executable/main_gfx906.ll

@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa" @@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
$_Z20vector_square_kernelIfEvPT_PKS0_y = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any @@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
; Function Attrs: mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 {
; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_y(float addrspace(1)* nocapture writeonly %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 comdat {
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float @@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float
}
; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind }
@ -76,7 +78,7 @@ attributes #2 = { nounwind } @@ -76,7 +78,7 @@ attributes #2 = { nounwind }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 1}
!2 = !{i32 2, i32 0}
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
!3 = !{!"AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"}
!4 = !{!5, !9, i64 12}
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
!6 = !{!"short", !7, i64 0}

16
HIP-Basic/llvm_ir_to_executable/main_gfx908.ll

@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa" @@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
$_Z20vector_square_kernelIfEvPT_PKS0_y = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any @@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
; Function Attrs: mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 {
; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_y(float addrspace(1)* nocapture writeonly %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 comdat {
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float @@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float
}
; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind }
@ -76,7 +78,7 @@ attributes #2 = { nounwind } @@ -76,7 +78,7 @@ attributes #2 = { nounwind }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 1}
!2 = !{i32 2, i32 0}
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
!3 = !{!"AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"}
!4 = !{!5, !9, i64 12}
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
!6 = !{!"short", !7, i64 0}

16
HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll

@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa" @@ -8,6 +8,8 @@ target triple = "amdgcn-amd-amdhsa"
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
$_Z20vector_square_kernelIfEvPT_PKS0_y = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any @@ -22,8 +24,8 @@ $_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
; Function Attrs: mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 {
; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_y(float addrspace(1)* nocapture writeonly %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 comdat {
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float @@ -57,15 +59,15 @@ define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float
}
; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind }
@ -76,7 +78,7 @@ attributes #2 = { nounwind } @@ -76,7 +78,7 @@ attributes #2 = { nounwind }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 1}
!2 = !{i32 2, i32 0}
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
!3 = !{!"AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"}
!4 = !{!5, !9, i64 12}
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
!6 = !{!"short", !7, i64 0}

26
HIP-Basic/matrix_multiplication/Makefile

@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include @@ -31,22 +31,28 @@ HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=
CXX_STD := c++17
ICXXFLAGS := -std=$(CXX_STD)
ICPPFLAGS := -I $(COMMON_INCLUDE_DIR)
ILDFLAGS :=
ILDLIBS :=
ifeq ($(GPU_RUNTIME), CUDA)
CXXFLAGS += -x cu
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
ICXXFLAGS += -x cu
ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
CXXFLAGS ?= -Wall -Wextra
else
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/cmdparser.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
ICXXFLAGS += $(CXXFLAGS)
ICPPFLAGS += $(CPPFLAGS)
ILDFLAGS += $(LDFLAGS)
ILDLIBS += $(LDLIBS)
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
clean:
$(RM) $(EXAMPLE)

202
HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj

@ -1,101 +1,101 @@ @@ -1,101 +1,101 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>matrix_multiplication_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level1</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.hip" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\example_utils.hpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>matrix_multiplication_vs2019</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>HIP</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<TargetName>hip_$(ProjectName)</TargetName>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level2</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<RuntimeTypeInfo>true</RuntimeTypeInfo>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
</ImportGroup>
</Project>

2
HIP-Basic/module_api/.gitignore vendored

@ -0,0 +1,2 @@ @@ -0,0 +1,2 @@
hip_module_api
module.co

76
HIP-Basic/module_api/CMakeLists.txt

@ -0,0 +1,76 @@ @@ -0,0 +1,76 @@
# MIT License
#
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Declare the minimum CMake version first, then name and create the project.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
set(example_name hip_module_api)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME is exposed as a cache entry, but this example builds with HIP only
# (not CUDA): any other value aborts configuration.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
if(NOT "${GPU_RUNTIME}" STREQUAL "HIP")
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.")
    message(FATAL_ERROR ${ERROR_MESSAGE})
endif()

# Enable the GPU language and require standard C++17 without compiler extensions.
enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)

# Use the ROCm root as CMAKE_PREFIX_PATH only when the user has not set one.
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()
# Turn every requested HIP architecture into an "--offload-arch=<arch>" argument.
set(offload_archs ${CMAKE_HIP_ARCHITECTURES})
list(TRANSFORM offload_archs PREPEND "--offload-arch=")

# The code object is written to the build tree and compiled from module.hip in
# the source tree.
set(module ${CMAKE_CURRENT_BINARY_DIR}/module.co)
set(module_sources ${CMAKE_CURRENT_SOURCE_DIR}/module.hip)

# Pick the HIP compile flags that belong to the active build type.
# STREQUAL is required here: EQUAL is a numeric comparison and never matches a
# build type name, which would silently leave module_flags empty.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
    set(module_flags "${CMAKE_HIP_FLAGS} ${CMAKE_HIP_FLAGS_DEBUG}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    set(module_flags "${CMAKE_HIP_FLAGS} ${CMAKE_HIP_FLAGS_RELEASE}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "MinSizeRel")
    set(module_flags "${CMAKE_HIP_FLAGS} ${CMAKE_HIP_FLAGS_MINSIZEREL}")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo")
    set(module_flags "${CMAKE_HIP_FLAGS} ${CMAKE_HIP_FLAGS_RELWITHDEBINFO}")
endif()

# The CMAKE_HIP_FLAGS* variables are space-separated strings (e.g. "-O3 -DNDEBUG").
# Split them into a list so that each flag reaches the compiler as its own
# command-line argument instead of one argument containing spaces.
separate_arguments(module_flags NATIVE_COMMAND "${module_flags}")
# Build rule for the code object: compile the module source for the device only
# (--cuda-device-only) for every requested offload architecture.
# VERBATIM makes argument escaping identical on all platforms and generators.
add_custom_command(
    OUTPUT ${module}
    COMMAND ${CMAKE_HIP_COMPILER} ${module_flags} ${module_sources} ${offload_archs} --cuda-device-only -o ${module}
    DEPENDS ${module_sources}
    COMMENT "Compiling HIP code object module.co"
    VERBATIM
)

# Attach the code object to the default build through a target in ALL.
add_custom_target(module ALL DEPENDS ${module})
# Host executable of the example.
add_executable(${example_name} main.hip)

# Make example runnable using ctest.
# The NAME/COMMAND signature is used because only that form supports a target
# name as the command; CMake replaces it with the location of the built executable.
add_test(NAME ${example_name} COMMAND ${example_name})

# Headers shared between the examples live in the repository's Common directory.
set(include_dirs "../../Common")
target_include_directories(${example_name} PRIVATE ${include_dirs})

# main.hip must be compiled with the selected GPU language rather than as plain C++.
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save