diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a4d2732e..6dd911d8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -48,9 +48,10 @@ clang-format: script: - cd $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR - - scripts/code-format/check-format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT" + - Scripts/CodeFormat/check_format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT" .build:dockerfiles: + timeout: 20m image: name: gcr.io/kaniko-project/executor:debug entrypoint: [""] @@ -209,7 +210,7 @@ test:rocm-windows-cmake: -S "$CI_PROJECT_DIR" -B "$CI_PROJECT_DIR/build" -G Ninja - -D CMAKE_BUILD_TYPE="$CONFIG" + -D CMAKE_BUILD_TYPE="$BUILD_TYPE" -D CMAKE_HIP_ARCHITECTURES=gfx1030 -D CMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/10/bin/10.0.19041.0/x64/rc.exe" - cmake --build "$CI_PROJECT_DIR/build" diff --git a/Common/cmdparser.hpp b/Common/cmdparser.hpp new file mode 100644 index 00000000..a2a566b8 --- /dev/null +++ b/Common/cmdparser.hpp @@ -0,0 +1,768 @@ +// MIT License +// +// Copyright (c) 2015 - 2016 Florian Rappl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/* + This file is part of the C++ CmdParser utility. + Copyright (c) 2015 - 2019 Florian Rappl +*/ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace cli +{ +/// Class used to wrap integer types to specify desired numerical base for specific argument parsing +template +class NumericalBase +{ +public: + /// This constructor required for correct AgrumentCountChecker initialization + NumericalBase() : value(0), base(numericalBase) {} + + /// This constructor required for default value initialization + /// \param val comes from default value + NumericalBase(T val) : value(val), base(numericalBase) {} + + operator T() const + { + return this->value; + } + operator T*() + { + return this->value; + } + + T value; + unsigned int base; +}; + +struct CallbackArgs +{ + const std::vector& arguments; + std::ostream& output; + std::ostream& error; +}; +class Parser +{ +private: + class CmdBase + { + public: + explicit CmdBase(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant, + bool variadic) + : name(name) + , command(name.size() > 0 ? "-" + name : "") + , alternative(alternative.size() > 0 ? 
"--" + alternative : "") + , description(description) + , required(required) + , handled(false) + , arguments({}) + , dominant(dominant) + , variadic(variadic) + {} + + virtual ~CmdBase() {} + + std::string name; + std::string command; + std::string alternative; + std::string description; + bool required; + bool handled; + std::vector arguments; + bool const dominant; + bool const variadic; + + virtual std::string print_value() const = 0; + virtual bool parse(std::ostream& output, std::ostream& error) = 0; + + bool is(const std::string& given) const + { + return given == command || given == alternative; + } + }; + + template + struct ArgumentCountChecker + { + static constexpr bool Variadic = false; + }; + + template + struct ArgumentCountChecker> + { + static constexpr bool Variadic = false; + }; + + template + struct ArgumentCountChecker> + { + static constexpr bool Variadic = true; + }; + + template + class CmdFunction final : public CmdBase + { + public: + explicit CmdFunction(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant) + : CmdBase(name, + alternative, + description, + required, + dominant, + ArgumentCountChecker::Variadic) + {} + + virtual bool parse(std::ostream& output, std::ostream& error) + { + try + { + CallbackArgs args{arguments, output, error}; + value = callback(args); + return true; + } + catch(...) 
+ { + return false; + } + } + + virtual std::string print_value() const + { + return ""; + } + + std::function callback; + T value; + }; + + template + class CmdArgument final : public CmdBase + { + public: + explicit CmdArgument(const std::string& name, + const std::string& alternative, + const std::string& description, + bool required, + bool dominant) + : CmdBase(name, + alternative, + description, + required, + dominant, + ArgumentCountChecker::Variadic) + {} + + virtual bool parse(std::ostream&, std::ostream&) + { + try + { + value = Parser::parse(arguments, value); + return true; + } + catch(...) + { + return false; + } + } + + virtual std::string print_value() const + { + return stringify(value); + } + + T value; + }; + + static int parse(const std::vector& elements, const int&, int numberBase = 0) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stoi(elements[0], 0, numberBase); + } + + static bool parse(const std::vector& elements, const bool& defval) + { + if(elements.size() != 0) + throw std::runtime_error("A boolean command line parameter cannot have any arguments."); + + return !defval; + } + + static double parse(const std::vector& elements, const double&) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stod(elements[0]); + } + + static float parse(const std::vector& elements, const float&) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stof(elements[0]); + } + + static long double parse(const std::vector& elements, const long double&) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stold(elements[0]); + } + + static unsigned int + parse(const std::vector& elements, const unsigned int&, int numberBase = 0) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return static_cast(std::stoul(elements[0], 0, numberBase)); + } + + static unsigned long + parse(const std::vector& elements, const unsigned long&, int numberBase = 0) + { + if(elements.size() != 
1) + throw std::bad_cast(); + + return std::stoul(elements[0], 0, numberBase); + } + + static unsigned long long parse(const std::vector& elements, + const unsigned long long&, + int numberBase = 0) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stoull(elements[0], 0, numberBase); + } + + static long long + parse(const std::vector& elements, const long long&, int numberBase = 0) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stoll(elements[0], 0, numberBase); + } + + static long parse(const std::vector& elements, const long&, int numberBase = 0) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return std::stol(elements[0], 0, numberBase); + } + + static std::string parse(const std::vector& elements, const std::string&) + { + if(elements.size() != 1) + throw std::bad_cast(); + + return elements[0]; + } + + template + static std::vector parse(const std::vector& elements, const std::vector&) + { + const T defval = T(); + std::vector values{}; + std::vector buffer(1); + + for(const auto& element : elements) + { + buffer[0] = element; + values.push_back(parse(buffer, defval)); + } + + return values; + } + + template + static T parse(const std::vector& elements, const NumericalBase& wrapper) + { + return parse(elements, wrapper.value, 0); + } + + /// Specialization for number wrapped into numerical base + /// \tparam T base type of the argument + /// \tparam base numerical base + /// \param elements + /// \param wrapper + /// \return parsed number + template + static T parse(const std::vector& elements, const NumericalBase& wrapper) + { + return parse(elements, wrapper.value, wrapper.base); + } + + template + static std::string stringify(const T& value) + { + return std::to_string(value); + } + + template + static std::string stringify(const NumericalBase& wrapper) + { + return std::to_string(wrapper.value); + } + + template + static std::string stringify(const std::vector& values) + { + std::stringstream ss{}; 
+ ss << "[ "; + + for(const auto& value : values) + { + ss << stringify(value) << " "; + } + + ss << "]"; + return ss.str(); + } + + static std::string stringify(const std::string& str) + { + return str; + } + +public: + explicit Parser(int argc, const char** argv) : _appname(argv[0]) + { + for(int i = 1; i < argc; ++i) + { + _arguments.push_back(argv[i]); + } + enable_help(); + } + + explicit Parser(int argc, char** argv) : _appname(argv[0]) + { + for(int i = 1; i < argc; ++i) + { + _arguments.push_back(argv[i]); + } + enable_help(); + } + + Parser(int argc, const char** argv, std::string generalProgramDescriptionForHelpText) + : _appname(argv[0]), _general_help_text(std::move(generalProgramDescriptionForHelpText)) + { + for(int i = 1; i < argc; ++i) + { + _arguments.push_back(argv[i]); + } + enable_help(); + } + + Parser(int argc, char** argv, std::string generalProgramDescriptionForHelpText) + : _appname(argv[0]), _general_help_text(std::move(generalProgramDescriptionForHelpText)) + { + for(int i = 1; i < argc; ++i) + { + _arguments.push_back(argv[i]); + } + enable_help(); + } + + ~Parser() + { + for(size_t i = 0, n = _commands.size(); i < n; ++i) + { + delete _commands[i]; + } + } + + bool has_help() const + { + for(const auto& command : _commands) + { + if(command->name == "h" && command->alternative == "--help") + { + return true; + } + } + + return false; + } + + void enable_help() + { + set_callback("h", + "help", + std::function( + [this](CallbackArgs& args) + { + args.output << this->usage(); +#pragma warning(push) +#pragma warning(disable : 4702) + exit(0); + return false; +#pragma warning(pop) + }), + "", + true); + } + + void disable_help() + { + for(auto command = _commands.begin(); command != _commands.end(); ++command) + { + if((*command)->name == "h" && (*command)->alternative == "--help") + { + _commands.erase(command); + break; + } + } + } + + template + void set_default(bool is_required, const std::string& description = "") + { + auto command = 
new CmdArgument{"", "", description, is_required, false}; + _commands.push_back(command); + } + + template + void set_required(const std::string& name, + const std::string& alternative, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdArgument{name, alternative, description, true, dominant}; + _commands.push_back(command); + } + + template + void set_optional(const std::string& name, + const std::string& alternative, + T defaultValue, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdArgument{name, alternative, description, false, dominant}; + command->value = defaultValue; + _commands.push_back(command); + } + + template + void set_callback(const std::string& name, + const std::string& alternative, + std::function callback, + const std::string& description = "", + bool dominant = false) + { + auto command = new CmdFunction{name, alternative, description, false, dominant}; + command->callback = callback; + _commands.push_back(command); + } + + inline void run_and_exit_if_error() + { + if(run() == false) + { + exit(1); + } + } + + inline bool run() + { + return run(std::cout, std::cerr); + } + + inline bool run(std::ostream& output) + { + return run(output, std::cerr); + } + + bool doesArgumentExist(std::string name, std::string altName) + { + for(const auto& argument : _arguments) + { + + if(argument == '-' + name || argument == altName) + { + return true; + } + } + + return false; + } + + inline bool doesHelpExist() + { + return doesArgumentExist("h", "--help"); + } + + bool run(std::ostream& output, std::ostream& error) + { + if(_arguments.size() > 0) + { + auto current = find_default(); + + for(size_t i = 0, n = _arguments.size(); i < n; ++i) + { + auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; + auto associated = isarg ? 
find(_arguments[i]) : nullptr; + + if(associated != nullptr) + { + current = associated; + associated->handled = true; + } + else if(current == nullptr) + { + error << no_default(); + return false; + } + else + { + current->arguments.push_back(_arguments[i]); + current->handled = true; + if(!current->variadic) + { + // If the current command is not variadic, then no more arguments + // should be added to it. In this case, switch back to the default + // command. + current = find_default(); + } + } + } + } + + // First, parse dominant arguments since they succeed even if required + // arguments are missing. + for(auto command : _commands) + { + if(command->handled && command->dominant && !command->parse(output, error)) + { + error << howto_use(command); + return false; + } + } + + // Next, check for any missing arguments. + for(auto command : _commands) + { + if(command->required && !command->handled) + { + error << howto_required(command); + return false; + } + } + + // Finally, parse all remaining arguments. 
+ for(auto command : _commands) + { + if(command->handled && !command->dominant && !command->parse(output, error)) + { + error << howto_use(command); + return false; + } + } + + return true; + } + + template + T get(const std::string& name) const + { + for(const auto& command : _commands) + { + if(command->name == name) + { + auto cmd = dynamic_cast*>(command); + + if(cmd == nullptr) + { + throw std::runtime_error("Invalid usage of the parameter " + name + + " detected."); + } + + return cmd->value; + } + } + + throw std::runtime_error("The parameter " + name + " could not be found."); + } + + template + T get_if(const std::string& name, std::function callback) const + { + auto value = get(name); + return callback(value); + } + + int requirements() const + { + int count = 0; + + for(const auto& command : _commands) + { + if(command->required) + { + ++count; + } + } + + return count; + } + + int commands() const + { + return static_cast(_commands.size()); + } + + inline const std::string& app_name() const + { + return _appname; + } + +protected: + CmdBase* find(const std::string& name) + { + for(auto command : _commands) + { + if(command->is(name)) + { + return command; + } + } + + return nullptr; + } + + CmdBase* find_default() + { + for(auto command : _commands) + { + if(command->name == "") + { + return command; + } + } + + return nullptr; + } + + std::string usage() const + { + std::stringstream ss{}; + ss << _general_help_text << "\n\n"; + ss << "Available parameters:\n\n"; + + for(const auto& command : _commands) + { + ss << " " << command->command << "\t" << command->alternative; + + if(command->required == true) + { + ss << "\t(required)"; + } + + ss << "\n " << command->description; + + if(command->required == false) + { + ss << "\n " + << "This parameter is optional. 
The default value is '" + command->print_value() + << "'."; + } + + ss << "\n\n"; + } + + return ss.str(); + } + + void print_help(std::stringstream& ss) const + { + if(has_help()) + { + ss << "For more help use --help or -h.\n"; + } + } + + std::string howto_required(CmdBase* command) const + { + std::stringstream ss{}; + ss << "The parameter " << command->name << " is required.\n"; + ss << command->description << '\n'; + print_help(ss); + return ss.str(); + } + + std::string howto_use(CmdBase* command) const + { + std::stringstream ss{}; + ss << "The parameter " << command->name << " has invalid arguments.\n"; + ss << command->description << '\n'; + print_help(ss); + return ss.str(); + } + + std::string no_default() const + { + std::stringstream ss{}; + ss << "No default parameter has been specified.\n"; + ss << "The given argument must be used with a parameter.\n"; + print_help(ss); + return ss.str(); + } + + const std::string& get_general_help_text() const + { + return _general_help_text; + } + + void set_general_help_text(const std::string& generalHelpText) + { + _general_help_text = generalHelpText; + } + +private: + const std::string _appname; + std::string _general_help_text; + std::vector _arguments; + std::vector _commands; +}; +} // namespace cli \ No newline at end of file diff --git a/Common/example_utils.hpp b/Common/example_utils.hpp index c0c3f24a..9e555e50 100644 --- a/Common/example_utils.hpp +++ b/Common/example_utils.hpp @@ -99,4 +99,61 @@ std::string format_pairs(const BidirectionalIteratorT begin_a, return sstream.str(); } +/// \brief A function to parse a string for an int. If the string is a valid integer then return true +/// else if it has non-numeric character then return false. 
+bool parse_int_string(const std::string& str, int& out) +{ + try + { + size_t end; + int value = std::stoi(str, &end); + if(end == str.size()) + { + out = value; + return true; + } + return false; + } + catch(const std::exception&) + { + return false; + } +} + +/// \brief A class to measure time between intervals +class HostClock +{ +private: + std::chrono::steady_clock::time_point start_time; + std::chrono::steady_clock::duration elapsed_time; + +public: + HostClock() + { + this->reset_timer(); + } + + void reset_timer() + { + this->elapsed_time = std::chrono::steady_clock::duration(0); + } + + void start_timer() + { + this->start_time = std::chrono::steady_clock::now(); + } + void stop_timer() + { + const auto end_time = std::chrono::steady_clock::now(); + this->elapsed_time += end_time - this->start_time; + } + + /// @brief Returns time elapsed in Seconds + /// @return type double that contains the elapsed time in Seconds + double get_elapsed_time() const + { + return std::chrono::duration_cast>(this->elapsed_time) + .count(); + } +}; #endif // COMMON_EXAMPLE_UTILS_HPP diff --git a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile index c19f64ed..291824d9 100644 --- a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile +++ b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile @@ -13,12 +13,17 @@ RUN export DEBIAN_FRONTEND=noninteractive; \ ssh \ sudo \ wget \ + pkg-config \ + glslang-tools \ + libvulkan-dev \ + vulkan-validationlayers \ + libglfw3-dev \ && rm -rf /var/lib/apt/lists/* # Install HIP using the installer script RUN export DEBIAN_FRONTEND=noninteractive; \ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \ - && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.2/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \ + && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.3/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \ && apt-get update -qq \ && apt-get install -y 
hip-base hipify-clang \ && apt-get download hip-runtime-nvidia hip-dev \ @@ -36,25 +41,25 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \ && ldconfig # Install rocRAND -RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.2.0.tar.gz \ - && tar -xf ./rocm-5.2.0.tar.gz \ - && rm ./rocm-5.2.0.tar.gz \ - && cmake -S ./rocRAND-rocm-5.2.0 -B ./rocRAND-rocm-5.2.0/build \ - -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \ +RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.3.0.tar.gz \ + && tar -xf ./rocm-5.3.0.tar.gz \ + && rm ./rocm-5.3.0.tar.gz \ + && cmake -S ./rocRAND-rocm-5.3.0 -B ./rocRAND-rocm-5.3.0/build \ + -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D BUILD_HIPRAND=OFF \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ - && cmake --build ./rocRAND-rocm-5.2.0/build --target install \ - && rm -rf ./rocRAND-rocm-5.2.0 + && cmake --build ./rocRAND-rocm-5.3.0/build --target install \ + && rm -rf ./rocRAND-rocm-5.3.0 # Install hipCUB -RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.2.0.tar.gz \ - && tar -xf ./rocm-5.2.0.tar.gz \ - && rm ./rocm-5.2.0.tar.gz \ - && cmake -S ./hipCUB-rocm-5.2.0 -B ./hipCUB-rocm-5.2.0/build \ - -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \ +RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.3.0.tar.gz \ + && tar -xf ./rocm-5.3.0.tar.gz \ + && rm ./rocm-5.3.0.tar.gz \ + && cmake -S ./hipCUB-rocm-5.3.0 -B ./hipCUB-rocm-5.3.0/build \ + -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -D CMAKE_INSTALL_PREFIX=/opt/rocm \ - && cmake --build ./hipCUB-rocm-5.2.0/build --target install \ - && rm -rf ./hipCUB-rocm-5.2.0 + && cmake --build ./hipCUB-rocm-5.3.0/build --target install \ + && rm -rf ./hipCUB-rocm-5.3.0 # Add the render group and a user with sudo permissions for the container RUN groupadd --system --gid 109 render \ diff --git a/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile 
b/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile index ce57678e..9b247978 100644 --- a/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile +++ b/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile @@ -13,16 +13,21 @@ RUN export DEBIAN_FRONTEND=noninteractive; \ ssh \ sudo \ wget \ + pkg-config \ + glslang-tools \ + libvulkan-dev \ + vulkan-validationlayers \ + libglfw3-dev \ && rm -rf /var/lib/apt/lists/* ENV LANG en_US.utf8 # Install ROCM HIP and libraries using the installer script RUN export DEBIAN_FRONTEND=noninteractive; \ - wget https://repo.radeon.com/amdgpu-install/22.20/ubuntu/focal/amdgpu-install_22.20.50200-1_all.deb \ + wget https://repo.radeon.com/amdgpu-install/5.3/ubuntu/focal/amdgpu-install_5.3.50300-1_all.deb \ && apt-get update -qq \ - && apt-get install -y ./amdgpu-install_22.20.50200-1_all.deb \ - && rm ./amdgpu-install_22.20.50200-1_all.deb \ + && apt-get install -y ./amdgpu-install_5.3.50300-1_all.deb \ + && rm ./amdgpu-install_5.3.50300-1_all.deb \ && amdgpu-install -y --usecase=hiplibsdk --no-dkms \ && apt-get install -y libnuma-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/docs/CONTRIBUTING.md b/Docs/CONTRIBUTING.md similarity index 74% rename from docs/CONTRIBUTING.md rename to Docs/CONTRIBUTING.md index fcd45efa..70a8dcb0 100644 --- a/docs/CONTRIBUTING.md +++ b/Docs/CONTRIBUTING.md @@ -13,12 +13,16 @@ Every example has to be able to be built separately from the others, but also ha ## Code Format The formatting rules of the examples are enforced by `clang-format` using the `.clang-format` file in the top-level directory. -## Naming Conventions +## Variable Naming Conventions - Use `lower_snake_case` style to name variables and functions (e.g. block_size, multiply_kernel and multiply_host). - Use `PascalCase` for `class`, `struct`, `enum` and template argument definitions. -## Binary Naming Conventions -Use the prefix of the library for the name of the binary, so that there are no conflicts between libraries (e.g. 
hipcub_device_sum and rocprim_device_sum). +## File and Directory Naming Conventions +- Top-level directories use `PascalCase`. +- The directories in Libraries/ should use the exact name of the library they represent, including casing. If any directory does not represent a library (`exampleLibraryTemplate`), it should be named in `camelCase`. +- Directories for individual examples use `snake_case`. +- Files generally use `snake_case`, with the exception of files for which an existing convention already applies (`README.md`, `LICENSE.md`, `CMakeLists.txt`, etc.). +- Example binaries should be prefixed with the library name of the binary, so that there are no conflicts between libraries (e.g. `hipcub_device_sum` and `rocprim_device_sum`). ## Utilities Utility-functions (printing vectors, etc) and common error-handling code, that is used by all examples, should be moved to the common utility-header [example_utils.hpp](../Common/example_utils.hpp). diff --git a/HIP-Basic/CMakeLists.txt b/HIP-Basic/CMakeLists.txt index 959a1d22..34abda25 100644 --- a/HIP-Basic/CMakeLists.txt +++ b/HIP-Basic/CMakeLists.txt @@ -23,6 +23,13 @@ cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(HIP-Basic) +# Only supported on HIP (not CUDA) +if(NOT "${GPU_RUNTIME}" STREQUAL "CUDA") + add_subdirectory(assembly_to_executable) + add_subdirectory(llvm_ir_to_executable) +endif() + +add_subdirectory(bandwidth) add_subdirectory(device_query) add_subdirectory(dynamic_shared) add_subdirectory(events) @@ -32,5 +39,8 @@ if(NOT WIN32) endif() add_subdirectory(matrix_multiplication) add_subdirectory(occupancy) +add_subdirectory(runtime_compilation) add_subdirectory(saxpy) +add_subdirectory(shared_memory) add_subdirectory(streams) +add_subdirectory(warp_shuffle) diff --git a/HIP-Basic/Makefile b/HIP-Basic/Makefile index 0a07e38c..b04c0749 100644 --- a/HIP-Basic/Makefile +++ b/HIP-Basic/Makefile @@ -21,6 +21,7 @@ # SOFTWARE. 
EXAMPLES := \ + bandwidth \ device_query \ dynamic_shared \ events \ @@ -28,8 +29,18 @@ EXAMPLES := \ hipify \ matrix_multiplication \ occupancy \ + runtime_compilation \ saxpy \ - streams + shared_memory \ + streams \ + warp_shuffle + +# Only supported on HIP (not CUDA). +ifneq ($(GPU_RUNTIME), CUDA) + EXAMPLES += \ + assembly_to_executable \ + llvm_ir_to_executable +endif all: $(EXAMPLES) diff --git a/HIP-Basic/assembly_to_executable/.gitignore b/HIP-Basic/assembly_to_executable/.gitignore new file mode 100644 index 00000000..b5b29f16 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/.gitignore @@ -0,0 +1,3 @@ +hip_assembly_to_executable +*.o +*.hipfb diff --git a/HIP-Basic/assembly_to_executable/CMakeLists.txt b/HIP-Basic/assembly_to_executable/CMakeLists.txt new file mode 100644 index 00000000..e93c5f57 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/CMakeLists.txt @@ -0,0 +1,174 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name hip_assembly_to_executable) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") + +# Only supported on HIP (not CUDA) +if(NOT "${GPU_RUNTIME}" STREQUAL "HIP") + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +if(NOT CMAKE_PREFIX_PATH) + set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") +endif() + +if (NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(GPU_ARCHITECTURES "all" CACHE STRING "GPU architectures to compile for") +else() + set(GPU_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "GPU architectures to compile for") +endif() + +if(GPU_ARCHITECTURES STREQUAL "all") + set(GPU_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "GPU architectures to compile for" FORCE) +endif() + +# Remove duplicates. 
+list(REMOVE_DUPLICATES GPU_ARCHITECTURES) +message(STATUS "GPU_ARCHITECTURES: ${GPU_ARCHITECTURES}") + +set_source_files_properties(main.hip PROPERTIES COMPILE_OPTIONS "--cuda-host-only") + +if (WIN32) + set(OBJ_TYPE obj) + set(NULDEV NUL) + set(HOST_TARGET x86_64-pc-windows-msvc) + set(HIP_OBJ_GEN_FILE hip_obj_gen_win.mcin) +else() + set(OBJ_TYPE o) + set(NULDEV /dev/null) + set(HOST_TARGET x86_64-unknown-linux) + set(HIP_OBJ_GEN_FILE hip_obj_gen.mcin) +endif() + +# Assemble the device assemblies to object files using the HIP compiler. +# The compiler needs -target amdgcn-amd-amdhsa -mcpu=gfx* in order to assemble the object file +# for the right GPU. +foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES}) + message(STATUS "Generating main_${HIP_ARCHITECTURE}.${OBJ_TYPE}") + add_custom_command( + OUTPUT main_${HIP_ARCHITECTURE}.${OBJ_TYPE} + COMMAND ${CMAKE_HIP_COMPILER} -fPIC -target amdgcn-amd-amdhsa -mcpu=${HIP_ARCHITECTURE} + ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.s + -o ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.s + VERBATIM) +endforeach() + +# Create an offload-bundle from the assembled object files. This needs the clang-offload-bundler tool. +find_program( + OFFLOAD_BUNDLER_COMMAND clang-offload-bundler + PATH_SUFFIXES bin + PATHS + ${ROCM_ROOT}/llvm + ${CMAKE_INSTALL_PREFIX}/llvm + REQUIRED) + +if(OFFLOAD_BUNDLER_COMMAND) + message(STATUS "clang-offload-bundler found: ${OFFLOAD_BUNDLER_COMMAND}") +else() + message(FATAL_ERROR "clang-offload-bundler not found") +endif() + +# Generate object bundle. +# The invocation to generate is +# clang-offload-bundler -targets=<targets> -input=<input 1> -input=<input 2> ... -output=<output> +# Note that the host target must be the first target present here, and it should have an empty input associated to it. + +# Generate BUNDLE_TARGETS as a string of: -targets=host-${HOST_TARGET},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE},... 
+set(BUNDLE_TARGETS "-targets=host-${HOST_TARGET}") +# Generate BUNDLE_INPUTS as a string of: -input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} ... +set(BUNDLE_INPUTS "-input=${NULDEV}") +# Generate BUNDLE_OBJECTS as a string of: ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} +set(BUNDLE_OBJECTS "") +foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES}) + set(BUNDLE_TARGETS "${BUNDLE_TARGETS},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE}") + list(APPEND BUNDLE_INPUTS "-input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}") + list(APPEND BUNDLE_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}") +endforeach() + +# Invoke clang-offload-bundler to generate an offload bundle. +set(BUNDLE "${CMAKE_CURRENT_BINARY_DIR}/offload_bundle.hipfb") +add_custom_command( + OUTPUT "${BUNDLE}" + COMMAND + "${OFFLOAD_BUNDLER_COMMAND}" + -type=o + -bundle-align=4096 + "${BUNDLE_TARGETS}" + ${BUNDLE_INPUTS} + "-output=${BUNDLE}" + DEPENDS ${BUNDLE_OBJECTS} + VERBATIM) + +# Create the device binary by assembling the template that includes +# the offload bundle that was just generated using an .incbin directive. +# This needs an assembler. +find_program( + LLVM_MC_COMMAND llvm-mc + PATH_SUFFIXES bin + PATHS + ${ROCM_ROOT}/llvm + ${CMAKE_INSTALL_PREFIX}/llvm) + +if(LLVM_MC_COMMAND) + message(STATUS "llvm-mc found: ${LLVM_MC_COMMAND}") +else() + message(FATAL_ERROR "llvm-mc not found") +endif() + +# Invoke llvm-mc to generate an object file containing the offload bundle. +set(DEVICE_OBJECT "${CMAKE_CURRENT_BINARY_DIR}/main_device.${OBJ_TYPE}") +add_custom_command( + OUTPUT "${DEVICE_OBJECT}" + COMMAND + "${LLVM_MC_COMMAND}" + -triple "${HOST_TARGET}" + "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}" + -o "${DEVICE_OBJECT}" + --filetype=obj + DEPENDS "${BUNDLE}" + VERBATIM) + +# Finally, create the executable. +add_executable( + ${example_name} + main.hip + ${DEVICE_OBJECT}) + +# Make example runnable using ctest. 
+add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) diff --git a/HIP-Basic/assembly_to_executable/Makefile b/HIP-Basic/assembly_to_executable/Makefile new file mode 100644 index 00000000..3d26a352 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/Makefile @@ -0,0 +1,89 @@ +# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +EXAMPLE := hip_assembly_to_executable +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME ?= HIP + +ifneq ($(GPU_RUNTIME), HIP) +$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.) 
+endif + + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc +CLANG ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang +LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc +CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler + +# Common variables and flags +CXX_STD := c++17 +CXXFLAGS := -std=$(CXX_STD) +CPPFLAGS := -I $(COMMON_INCLUDE_DIR) +LDFLAGS := +LDLIBS := + +# Compile for these GPU architectures +HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030 + +# If white-space is given as a literal the `subst` cannot recognize it. +# Therefore this `empty` `space` hack is used in the tokenizing of HIP_ARCHITECTURES +# and the creation of GPU_ARCH_TRIPLES, which is later passed to CLANG_OFFLOAD_BUNDLER +# in the targets option. The targets option needs to be a single string with no spaces. +empty = +space = $(empty) $(empty) +comma = , + +GPU_ARCHS := $(subst ;,$(space),$(HIP_ARCHITECTURES)) +GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amdhsa--%)) + +all: $(EXAMPLE) + +$(EXAMPLE): main.o main_device.o + $(HIPCXX) -o $@ $^ + +main_device.o: hip_obj_gen.mcin offload_bundle.hipfb + $(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj + +offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o) + $(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 \ + -targets=host-x86_64-unknown-linux,$(GPU_ARCH_TRIPLES) \ + -input=/dev/null \ + $(^:%=-input=%) \ + -output=$@ + +main.o: main.hip + $(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only $< + +main_%.o: main_%.s + $(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $< + +clean: + rm -f \ + main_*.o \ + offload_bundle.hipfb \ + main_device.o \ + main.o \ + $(EXAMPLE) + +.PHONY: all clean diff --git a/HIP-Basic/assembly_to_executable/README.md b/HIP-Basic/assembly_to_executable/README.md new file mode 100644 index 00000000..f7307df9 --- /dev/null +++
b/HIP-Basic/assembly_to_executable/README.md @@ -0,0 +1,115 @@ +# HIP-Basic Assembly to Executable Example + +## Description +This example shows how to manually compile and link a HIP application from device assembly. Pre-generated assembly files are compiled into an _offload bundle_, a bundle of device object files, and then linked with the host object file to produce the final executable. + +Building HIP executables from device assembly can be useful, for example, to experiment with specific instructions, perform specific optimizations, or help with debugging. + +### Building + +- Build with Makefile: to compile for specific GPU architectures, optionally provide the HIP_ARCHITECTURES variable. Provide the architectures separated by semicolons. + ```shell + make HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" + ``` +- Build with CMake: + ```shell + cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" + cmake --build build + ``` + On Windows the path to RC compiler may be needed: `-DCMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/path/to/x64/rc.exe"` + +## Generating device assembly +This example creates a HIP executable from device assembly code; however, such assembly files can also be created from HIP source code using `hipcc`. This can be done by passing `-S` and `--cuda-device-only` to hipcc. The former flag instructs the compiler to generate human-readable assembly instead of machine code, and the latter instructs the compiler to only compile the device part of the program.
The six assembly files for this example were generated as follows: +```shell +$ROCM_INSTALL_DIR/bin/hipcc -S --cuda-device-only --offload-arch=gfx803 --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a --offload-arch=gfx1030 main.hip +``` + +The user may modify the `--offload-arch` flag to build for other architectures and choose to either enable or disable extra device code-generation features such as `xnack` or `sram-ecc`, which can be specified as `--offload-arch=:+` to enable it or `--offload-arch=:-` to disable it. Multiple features may be present, separated by colons. + +## Build Process +A HIP binary consists of a regular host executable, which has an offload bundle containing device code embedded inside it. This offload bundle contains object files for each of the target devices that it is compiled for, and is loaded at runtime to provide the machine code for the current device. A HIP executable can be built from device assembly files and host HIP code according to the following process: + +1. The `main.hip` file is compiled to an object file that only contains host code with `hipcc` by using the `--cuda-host-only` option. `main.hip` is a program that launches a simple kernel to compute the square of each element of a vector. The `-c` option is required to prevent the compiler from creating an executable, and make it create an object file containing the compiled host code instead. + ```shell + $ROCM_INSTALL_DIR/bin/hipcc -c --cuda-host-only main.hip + ``` + +2. Each device assembly file is compiled to a device object file using `clang`. This requires specifying the correct architecture using `-target amdgcn-amd-amdhsa`, and the target architecture that should be assembled for using `-mcpu`: + + ```shell + $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx1030 main_gfx1030.s -o main_gfx1030.o + $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu= main_.s -o main_.o + ... + ``` + +3. 
The device object files are combined into an offload bundle using `clang-offload-bundler`. This requires specifying the target as well as the offload kind for each device, in the form `--`. For HIP device code, `` is `hipv4`. Note that this command requires an (empty) entry for the host to also be present, with `` `host`. The order of targets and inputs must match. `` is an LLVM target triple, which is specified as `---`. `` is left empty for AMD targets. + + ```shell + $ROCM_INSTALL_DIR/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 \ + -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx1030,hipv4-... \ + -input=/dev/null \ + -input=main_gfx1030.o -input=... \ + -output=offload_bundle.hipfb + ``` + + Note: using -bundle-align=4096 only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same `--offload-arch` as when compiling to assembly. + +4. The offload bundle is embedded inside an object file that can be linked with the object file containing the host code. The offload bundle must be placed in the `.hip_fatbin` section, and must be placed after the symbol `__hip_fatbin`. This can be done by creating an assembly file that places the offload bundle in the appropriate section using the `.incbin` directive: + ```nasm + .type __hip_fatbin,@object + ; Tell the assembler to place the offload bundle in the appropriate section. + .section .hip_fatbin,"a",@progbits + ; Make the symbol that addresses the binary public + .globl __hip_fatbin + ; Give the bundle the required alignment + .p2align 12 + __hip_fatbin: + ; Include the binary + .incbin "offload_bundle.hipfb" + ``` + This file can then be assembled using `llvm-mc` as follows: + ``` + $ROCM_INSTALL_DIR/llvm/bin/llvm-mc -triple -o main_device.o hip_obj_gen.mcin --filetype=obj + ``` + +5. 
Finally, using the system linker, hipcc, or clang, the host object and device objects are linked into an executable: + ```shell + $ROCM_INSTALL_DIR/bin/hipcc -o hip_assembly_to_executable main.o main_device.o + ``` + +### Visual Studio 2019 +The above compilation steps are implemented in Visual Studio through Custom Build Steps and Custom Build Tools: +- The host compilation from step 1 is performed by adding extra options to the source file, under `main.hip -> properties -> C/C++ -> Command Line`: + ``` + Additional Options: --cuda-host-only + ``` +- Each device assembly .s file has a custom build tool associated with it, which performs the operation associated with step 2 from the previous section: + ``` + Command Line: "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a + Description: Compiling Device Assembly %(Identity) + Output: $(IntDir)%(FileName).o + Execute Before: ClCompile + ``` +- Steps 3 and 4 are implemented using a custom build step: + ``` + Command Line: + "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" + cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj + Description: Generating Device Offload Object + Outputs: $(IntDir)main_device.obj + Additional Dependencies: $(IntDir)main_gfx90a.o;$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs) +
Execute Before: ClCompile + ``` +- Finally step 5 is implemented by passing additional inputs to the linker in `project -> properties -> Linker -> Input`: + ``` + Additional Dependencies: $(IntDir)main_device.obj;%(AdditionalDependencies) + ``` + +## Used API surface +### HIP runtime +- `hipFree` +- `hipGetDeviceProperties` +- `hipGetLastError` +- `hipLaunchKernelGGL` +- `hipMalloc` +- `hipMemcpy` diff --git a/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln new file mode 100644 index 00000000..1ceb39e1 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "assembly_to_executable_vs2019", "assembly_to_executable_vs2019.vcxproj", "{60B4ADE0-8286-46AE-B884-5DA51B541DED}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.ActiveCfg = Debug|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.Build.0 = Debug|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.ActiveCfg = Release|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {5EAD9B5F-41B6-452E-922F-D5782C75EB8F} + EndGlobalSection +EndGlobal diff --git a/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj new file mode 100644 index 
00000000..7783b217 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + --cuda-host-only + --cuda-host-only + + + + + + + + Document + copy %(Identity) "$(IntDir)%(Identity)" + Copying %(Identity) + $(IntDir)%(Identity) + copy %(Identity) "$(IntDir)%(Identity)" + Copying %(Identity) + $(IntDir)%(Identity) + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a + + + + 15.0 + {60b4ade0-8286-46ae-b884-5da51b541ded} + Win32Proj + assembly_to_executable_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP +
true + Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + ClCompile + + + false + hip_$(ProjectName) + ClCompile + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + $(IntDir)main_device.obj;%(AdditionalDependencies) + + + Compiling Device Assembly %(Identity) + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa + $(IntDir)%(FileName).o + + + "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" +cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj + + + Generating Device Offload Object + + + $(IntDir)main_device.obj + + + $(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs) + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + true + true + $(IntDir)main_device.obj;%(AdditionalDependencies) + + + Compiling Device Assembly %(Identity) + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa + $(IntDir)%(FileName).o + + +
"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" +cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj + + + Generating Device Offload Object + + + $(IntDir)main_device.obj + + + $(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs) + + + + + + + diff --git a/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters new file mode 100644 index 00000000..205bad8d --- /dev/null +++ b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters @@ -0,0 +1,53 @@ + + + + + {4f2a1544-a556-4afb-b630-36ba54c0ab4a} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {b93521e0-9944-411a-9f6e-4071af6bc7ea} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {972f07c3-b925-4516-bd65-2d5a3f626888} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + diff --git a/HIP-Basic/assembly_to_executable/hip_obj_gen.mcin b/HIP-Basic/assembly_to_executable/hip_obj_gen.mcin new file mode 100644 index 00000000..6b9fee5f --- /dev/null +++
b/HIP-Basic/assembly_to_executable/hip_obj_gen.mcin @@ -0,0 +1,21 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# HIP Object Generator +# Use this generator to create a host bundled object file +# with the input of an offload bundled fat binary. +# +# Input: Bundled Object file .hipfb file +# Output: Host Bundled Object File .o + + .type __hip_fatbin,@object + # Tell the assembler to place the offload bundle in the appropriate section. + .section .hip_fatbin,"a",@progbits + # Make the symbol that addresses the binary public. + .globl __hip_fatbin + # Give the bundle the required alignment of 4096 (2 ^ 12). + .p2align 12 +__hip_fatbin: + # Include the offload bundle. + .incbin "offload_bundle.hipfb" diff --git a/HIP-Basic/assembly_to_executable/hip_obj_gen_win.mcin b/HIP-Basic/assembly_to_executable/hip_obj_gen_win.mcin new file mode 100644 index 00000000..3636354e --- /dev/null +++ b/HIP-Basic/assembly_to_executable/hip_obj_gen_win.mcin @@ -0,0 +1,20 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# HIP Object Generator +# Use this generator to create a host bundled object file +# with the input of an offload bundled fat binary. +# +# Input: Bundled Object file .hipfb file +# Output: Host Bundled Object File .o + + # Tell the assembler to place the offload bundle in the appropriate section. + .section .hip_fatbin,"dw" + # Make the symbol that addresses the binary public. + .globl __hip_fatbin + # Give the bundle the required alignment of 4096 (2 ^ 12). + .p2align 12 +__hip_fatbin: + # Include the offload bundle. 
+ .incbin "offload_bundle.hipfb" diff --git a/HIP-Basic/assembly_to_executable/main.hip b/HIP-Basic/assembly_to_executable/main.hip new file mode 100644 index 00000000..588fc070 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main.hip @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "example_utils.hpp" + +#include + +#include +#include +#include + +/// \brief Device function to square each element +/// in the array `in` and write to array `out`. 
+template +__global__ void vector_square_kernel(T* out, const T* in, const long long size) +{ + // Get the unique global thread ID + const size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + // Each thread hops stride amount of elements to find the next + // element to square + const size_t stride = blockDim.x * gridDim.x; + + for(size_t i = offset; i < size; i += stride) + { + out[i] = in[i] * in[i]; + } +} + +int main() +{ + // Set the problem size + constexpr size_t size = 1000000; + constexpr size_t size_in_bytes = size * sizeof(float); + + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0 /*deviceID*/)); + std::cout << "info: running on device " << props.name << "\n"; + + std::cout << "info: allocate host mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB) " + << "\n"; + + // Declare the host side arrays + std::vector h_in(size); + std::vector h_out(size); + + // Initialize the host size input + for(size_t i = 0; i < size; i++) + { + h_in[i] = 1.618f + i; + } + + // Declare the device side arrays + float *d_in, *d_out; + std::cout << "info: allocate device mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB)\n"; + // Allocate the device side memory + HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); + HIP_CHECK(hipMalloc(&d_out, size_in_bytes)); + + std::cout << "info: copy Host2Device\n"; + + // Copy the input from host to the GPU device + HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); + + // Set the number of blocks per kernel grid. + constexpr unsigned int grid_size = 512; + // Set the number of threads per kernel block. + constexpr unsigned int threads_per_block = 256; + + std::cout << "info: launch 'vector_square_kernel' kernel\n"; + hipLaunchKernelGGL(vector_square_kernel, + grid_size, + threads_per_block, + 0, + hipStreamDefault, + d_out, + d_in, + size); + + // Check that the kernel invocation was successful. 
+ HIP_CHECK(hipGetLastError()); + + std::cout << "info: copy Device2Host\n"; + HIP_CHECK(hipMemcpy(h_out.data(), d_out, size_in_bytes, hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + + std::cout << "info: check result\n"; + for(size_t i = 0; i < size; i++) + { + if(h_out[i] != h_in[i] * h_in[i]) + { + std::cerr << "FAILED! h_out[" << i << "] = " << h_out[i] + << ", expected: " << h_in[i] * h_in[i] << '\n'; + exit(error_exit_code); + } + } + std::cout << "PASSED!\n"; +} diff --git a/HIP-Basic/assembly_to_executable/main_gfx1030.s b/HIP-Basic/assembly_to_executable/main_gfx1030.s new file mode 100644 index 00000000..ce4952af --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main_gfx1030.s @@ -0,0 +1,219 @@ + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx1030" + .protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x + .globl _Z20vector_square_kernelIfEvPT_PKS0_x + .p2align 8 + .type _Z20vector_square_kernelIfEvPT_PKS0_x,@function +_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x +; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dwordx2 s[2:3], s[6:7], 0x10 + v_mov_b32_e32 v1, 0 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_nc_u32_e32 v0, s8, v0 + v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] + s_and_saveexec_b32 s0, vcc_lo + s_cbranch_execz BB0_3 +; %bb.1: + s_load_dword s8, s[4:5], 0xc + s_load_dwordx4 s[4:7], s[6:7], 0x0 + v_lshlrev_b64 v[2:3], 2, v[0:1] + s_mov_b32 s9, 0 + s_mov_b32 s1, s9 + s_waitcnt lgkmcnt(0) + s_lshl_b64 s[10:11], s[8:9], 2 + .p2align 6 +BB0_2: ; =>This Inner Loop Header: Depth=1 + v_add_co_u32 v4, vcc_lo, s6, v2 + v_add_co_ci_u32_e32 v5, vcc_lo, s7, v3, vcc_lo + v_add_co_u32 v0, vcc_lo, v0, s8 + v_add_co_ci_u32_e32 v1, vcc_lo, s9, v1, vcc_lo + global_load_dword v6, v[4:5], off + v_add_co_u32 v4, vcc_lo, s4, v2 + v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo + v_cmp_le_u64_e32 vcc_lo, s[2:3], 
v[0:1] + v_add_co_u32 v2, s0, v2, s10 + v_add_co_ci_u32_e64 v3, s0, s11, v3, s0 + s_or_b32 s1, vcc_lo, s1 + s_waitcnt vmcnt(0) + v_mul_f32_e32 v6, v6, v6 + global_store_dword v[4:5], v6, off + s_andn2_b32 exec_lo, exec_lo, s1 + s_cbranch_execnz BB0_2 +BB0_3: + s_endpgm + .section .rodata,#alloc + .p2align 6 + .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x + .amdhsa_group_segment_fixed_size 0 + .amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 80 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_wavefront_size32 1 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 7 + .amdhsa_next_free_sgpr 12 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_fp16_overflow 0 + .amdhsa_workgroup_processor_mode 1 + .amdhsa_memory_ordered 1 + .amdhsa_forward_progress 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 188 +; NumSgprs: 14 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; 
LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 1 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 14 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 16 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .text + .p2alignl 6, 3214868480 + .fill 48, 4, 3214868480 + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE +_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE 
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 + + .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" + .section ".note.GNU-stack" + .addrsig + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .amdgpu_metadata +--- +amdhsa.kernels: + - .args: + - .address_space: global + .offset: 0 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .offset: 16 + .size: 8 + .value_kind: by_value + - .offset: 24 + .size: 8 + .value_kind: hidden_global_offset_x + - .offset: 32 + .size: 8 + .value_kind: hidden_global_offset_y + - .offset: 40 + .size: 8 + .value_kind: hidden_global_offset_z + - .address_space: global + .offset: 48 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 56 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 72 + .size: 8 + .value_kind: hidden_multigrid_sync_arg + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 80 + .language: OpenCL C + .language_version: + - 2 + - 0 + .max_flat_workgroup_size: 1024 + .name: _Z20vector_square_kernelIfEvPT_PKS0_x + .private_segment_fixed_size: 0 + .sgpr_count: 14 + .sgpr_spill_count: 0 + .symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd + .vgpr_count: 7 + .vgpr_spill_count: 0 + .wavefront_size: 32 +amdhsa.target: amdgcn-amd-amdhsa--gfx1030 +amdhsa.version: + - 1 + - 1 +... 
+ + .end_amdgpu_metadata diff --git a/HIP-Basic/assembly_to_executable/main_gfx803.s b/HIP-Basic/assembly_to_executable/main_gfx803.s new file mode 100644 index 00000000..7f9c7f3f --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main_gfx803.s @@ -0,0 +1,214 @@ + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx803" + .protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x + .globl _Z20vector_square_kernelIfEvPT_PKS0_x + .p2align 8 + .type _Z20vector_square_kernelIfEvPT_PKS0_x,@function +_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x +; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dwordx2 s[10:11], s[6:7], 0x10 + v_mov_b32_e32 v1, 0 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_u32_e32 v0, vcc, s8, v0 + v_cmp_gt_u64_e32 vcc, s[10:11], v[0:1] + s_and_saveexec_b64 s[0:1], vcc + s_cbranch_execz BB0_3 +; %bb.1: + s_load_dword s8, s[4:5], 0xc + s_load_dwordx4 s[4:7], s[6:7], 0x0 + s_mov_b32 s9, 0 + v_lshlrev_b64 v[2:3], 2, v[0:1] + s_mov_b64 s[14:15], 0 + s_waitcnt lgkmcnt(0) + s_lshl_b64 s[12:13], s[8:9], 2 +BB0_2: ; =>This Inner Loop Header: Depth=1 + v_mov_b32_e32 v5, s7 + v_add_u32_e32 v4, vcc, s6, v2 + v_addc_u32_e32 v5, vcc, v5, v3, vcc + flat_load_dword v6, v[4:5] + v_mov_b32_e32 v5, s5 + v_mov_b32_e32 v7, s9 + v_add_u32_e32 v0, vcc, s8, v0 + v_mov_b32_e32 v8, s13 + v_add_u32_e64 v4, s[0:1], s4, v2 + v_add_u32_e64 v2, s[2:3], s12, v2 + v_addc_u32_e64 v5, s[0:1], v5, v3, s[0:1] + v_addc_u32_e32 v1, vcc, v1, v7, vcc + v_addc_u32_e64 v3, vcc, v3, v8, s[2:3] + v_cmp_le_u64_e32 vcc, s[10:11], v[0:1] + s_or_b64 s[14:15], vcc, s[14:15] + s_waitcnt vmcnt(0) + v_mul_f32_e32 v6, v6, v6 + flat_store_dword v[4:5], v6 + s_andn2_b64 exec, exec, s[14:15] + s_cbranch_execnz BB0_2 +BB0_3: + s_endpgm + .section .rodata,#alloc + .p2align 6 + .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x + .amdhsa_group_segment_fixed_size 0 + 
.amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 80 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 16 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 0 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 200 +; NumSgprs: 18 +; NumVgprs: 9 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 192 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 2 +; VGPRBlocks: 2 +; NumSGPRsForWavesPerEU: 18 +; NumVGPRsForWavesPerEU: 9 +; Occupancy: 10 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .type 
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE +_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE +_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 + + .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" + .section ".note.GNU-stack" + .addrsig + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .amdgpu_metadata +--- +amdhsa.kernels: + - .args: + - .address_space: global + .offset: 0 + 
.size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .offset: 16 + .size: 8 + .value_kind: by_value + - .offset: 24 + .size: 8 + .value_kind: hidden_global_offset_x + - .offset: 32 + .size: 8 + .value_kind: hidden_global_offset_y + - .offset: 40 + .size: 8 + .value_kind: hidden_global_offset_z + - .address_space: global + .offset: 48 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 56 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 72 + .size: 8 + .value_kind: hidden_multigrid_sync_arg + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 80 + .language: OpenCL C + .language_version: + - 2 + - 0 + .max_flat_workgroup_size: 1024 + .name: _Z20vector_square_kernelIfEvPT_PKS0_x + .private_segment_fixed_size: 0 + .sgpr_count: 18 + .sgpr_spill_count: 0 + .symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd + .vgpr_count: 9 + .vgpr_spill_count: 0 + .wavefront_size: 64 +amdhsa.target: amdgcn-amd-amdhsa--gfx803 +amdhsa.version: + - 1 + - 1 +... 
+ + .end_amdgpu_metadata diff --git a/HIP-Basic/assembly_to_executable/main_gfx900.s b/HIP-Basic/assembly_to_executable/main_gfx900.s new file mode 100644 index 00000000..6ca519c1 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main_gfx900.s @@ -0,0 +1,216 @@ + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx900" + .protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x + .globl _Z20vector_square_kernelIfEvPT_PKS0_x + .p2align 8 + .type _Z20vector_square_kernelIfEvPT_PKS0_x,@function +_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x +; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dwordx2 s[12:13], s[6:7], 0x10 + v_mov_b32_e32 v1, 0 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_u32_e32 v0, s8, v0 + v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] + s_and_saveexec_b64 s[0:1], vcc + s_cbranch_execz BB0_3 +; %bb.1: + s_load_dword s14, s[4:5], 0xc + s_load_dwordx4 s[8:11], s[6:7], 0x0 + s_mov_b32 s15, 0 + v_lshlrev_b64 v[2:3], 2, v[0:1] + s_mov_b64 s[6:7], 0 + s_waitcnt lgkmcnt(0) + s_lshl_b64 s[4:5], s[14:15], 2 +BB0_2: ; =>This Inner Loop Header: Depth=1 + v_mov_b32_e32 v5, s11 + v_add_co_u32_e32 v4, vcc, s10, v2 + v_addc_co_u32_e32 v5, vcc, v5, v3, vcc + global_load_dword v6, v[4:5], off + v_mov_b32_e32 v5, s9 + v_mov_b32_e32 v7, s15 + v_add_co_u32_e32 v0, vcc, s14, v0 + v_mov_b32_e32 v8, s5 + v_add_co_u32_e64 v4, s[0:1], s8, v2 + v_add_co_u32_e64 v2, s[2:3], s4, v2 + v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] + v_addc_co_u32_e32 v1, vcc, v1, v7, vcc + v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] + v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] + s_or_b64 s[6:7], vcc, s[6:7] + s_waitcnt vmcnt(0) + v_mul_f32_e32 v6, v6, v6 + global_store_dword v[4:5], v6, off + s_andn2_b64 exec, exec, s[6:7] + s_cbranch_execnz BB0_2 +BB0_3: + s_endpgm + .section .rodata,#alloc + .p2align 6 + .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x + .amdhsa_group_segment_fixed_size 0 + 
.amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 80 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 16 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_fp16_overflow 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 200 +; NumSgprs: 18 +; NumVgprs: 9 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 2 +; VGPRBlocks: 2 +; NumSGPRsForWavesPerEU: 18 +; NumVGPRsForWavesPerEU: 9 +; Occupancy: 10 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; 
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE +_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE +_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 + + .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" + .section ".note.GNU-stack" + .addrsig + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .amdgpu_metadata +--- +amdhsa.kernels: 
+ - .args: + - .address_space: global + .offset: 0 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .offset: 16 + .size: 8 + .value_kind: by_value + - .offset: 24 + .size: 8 + .value_kind: hidden_global_offset_x + - .offset: 32 + .size: 8 + .value_kind: hidden_global_offset_y + - .offset: 40 + .size: 8 + .value_kind: hidden_global_offset_z + - .address_space: global + .offset: 48 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 56 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 72 + .size: 8 + .value_kind: hidden_multigrid_sync_arg + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 80 + .language: OpenCL C + .language_version: + - 2 + - 0 + .max_flat_workgroup_size: 1024 + .name: _Z20vector_square_kernelIfEvPT_PKS0_x + .private_segment_fixed_size: 0 + .sgpr_count: 18 + .sgpr_spill_count: 0 + .symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd + .vgpr_count: 9 + .vgpr_spill_count: 0 + .wavefront_size: 64 +amdhsa.target: amdgcn-amd-amdhsa--gfx900 +amdhsa.version: + - 1 + - 1 +... 
+ + .end_amdgpu_metadata diff --git a/HIP-Basic/assembly_to_executable/main_gfx906.s b/HIP-Basic/assembly_to_executable/main_gfx906.s new file mode 100644 index 00000000..2447c87b --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main_gfx906.s @@ -0,0 +1,216 @@ + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx906" + .protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x + .globl _Z20vector_square_kernelIfEvPT_PKS0_x + .p2align 8 + .type _Z20vector_square_kernelIfEvPT_PKS0_x,@function +_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x +; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dwordx2 s[12:13], s[6:7], 0x10 + v_mov_b32_e32 v1, 0 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_u32_e32 v0, s8, v0 + v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] + s_and_saveexec_b64 s[0:1], vcc + s_cbranch_execz BB0_3 +; %bb.1: + s_load_dword s14, s[4:5], 0xc + s_load_dwordx4 s[8:11], s[6:7], 0x0 + s_mov_b32 s15, 0 + v_lshlrev_b64 v[2:3], 2, v[0:1] + s_mov_b64 s[6:7], 0 + s_waitcnt lgkmcnt(0) + s_lshl_b64 s[4:5], s[14:15], 2 +BB0_2: ; =>This Inner Loop Header: Depth=1 + v_mov_b32_e32 v5, s11 + v_add_co_u32_e32 v4, vcc, s10, v2 + v_addc_co_u32_e32 v5, vcc, v5, v3, vcc + global_load_dword v6, v[4:5], off + v_mov_b32_e32 v5, s9 + v_mov_b32_e32 v7, s15 + v_add_co_u32_e32 v0, vcc, s14, v0 + v_mov_b32_e32 v8, s5 + v_add_co_u32_e64 v4, s[0:1], s8, v2 + v_add_co_u32_e64 v2, s[2:3], s4, v2 + v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] + v_addc_co_u32_e32 v1, vcc, v1, v7, vcc + v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] + v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] + s_or_b64 s[6:7], vcc, s[6:7] + s_waitcnt vmcnt(0) + v_mul_f32_e32 v6, v6, v6 + global_store_dword v[4:5], v6, off + s_andn2_b64 exec, exec, s[6:7] + s_cbranch_execnz BB0_2 +BB0_3: + s_endpgm + .section .rodata,#alloc + .p2align 6 + .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x + .amdhsa_group_segment_fixed_size 0 + 
.amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 80 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 16 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_fp16_overflow 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 200 +; NumSgprs: 18 +; NumVgprs: 9 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 2 +; VGPRBlocks: 2 +; NumSGPRsForWavesPerEU: 18 +; NumVGPRsForWavesPerEU: 9 +; Occupancy: 10 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; 
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE +_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE +_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 + + .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" + .section ".note.GNU-stack" + .addrsig + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .amdgpu_metadata +--- +amdhsa.kernels: 
+ - .args: + - .address_space: global + .offset: 0 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .offset: 16 + .size: 8 + .value_kind: by_value + - .offset: 24 + .size: 8 + .value_kind: hidden_global_offset_x + - .offset: 32 + .size: 8 + .value_kind: hidden_global_offset_y + - .offset: 40 + .size: 8 + .value_kind: hidden_global_offset_z + - .address_space: global + .offset: 48 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 56 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 72 + .size: 8 + .value_kind: hidden_multigrid_sync_arg + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 80 + .language: OpenCL C + .language_version: + - 2 + - 0 + .max_flat_workgroup_size: 1024 + .name: _Z20vector_square_kernelIfEvPT_PKS0_x + .private_segment_fixed_size: 0 + .sgpr_count: 18 + .sgpr_spill_count: 0 + .symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd + .vgpr_count: 9 + .vgpr_spill_count: 0 + .wavefront_size: 64 +amdhsa.target: amdgcn-amd-amdhsa--gfx906 +amdhsa.version: + - 1 + - 1 +... 
+ + .end_amdgpu_metadata diff --git a/HIP-Basic/assembly_to_executable/main_gfx908.s b/HIP-Basic/assembly_to_executable/main_gfx908.s new file mode 100644 index 00000000..851f0a89 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main_gfx908.s @@ -0,0 +1,218 @@ + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx908" + .protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x + .globl _Z20vector_square_kernelIfEvPT_PKS0_x + .p2align 8 + .type _Z20vector_square_kernelIfEvPT_PKS0_x,@function +_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x +; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dwordx2 s[12:13], s[6:7], 0x10 + v_mov_b32_e32 v1, 0 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_u32_e32 v0, s8, v0 + v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] + s_and_saveexec_b64 s[0:1], vcc + s_cbranch_execz BB0_3 +; %bb.1: + s_load_dword s14, s[4:5], 0xc + s_load_dwordx4 s[8:11], s[6:7], 0x0 + s_mov_b32 s15, 0 + v_lshlrev_b64 v[2:3], 2, v[0:1] + s_mov_b64 s[6:7], 0 + s_waitcnt lgkmcnt(0) + s_lshl_b64 s[4:5], s[14:15], 2 +BB0_2: ; =>This Inner Loop Header: Depth=1 + v_mov_b32_e32 v5, s11 + v_add_co_u32_e32 v4, vcc, s10, v2 + v_addc_co_u32_e32 v5, vcc, v5, v3, vcc + global_load_dword v6, v[4:5], off + v_mov_b32_e32 v5, s9 + v_mov_b32_e32 v7, s15 + v_add_co_u32_e32 v0, vcc, s14, v0 + v_mov_b32_e32 v8, s5 + v_add_co_u32_e64 v4, s[0:1], s8, v2 + v_add_co_u32_e64 v2, s[2:3], s4, v2 + v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] + v_addc_co_u32_e32 v1, vcc, v1, v7, vcc + v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] + v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] + s_or_b64 s[6:7], vcc, s[6:7] + s_waitcnt vmcnt(0) + v_mul_f32_e32 v6, v6, v6 + global_store_dword v[4:5], v6, off + s_andn2_b64 exec, exec, s[6:7] + s_cbranch_execnz BB0_2 +BB0_3: + s_endpgm + .section .rodata,#alloc + .p2align 6 + .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x + .amdhsa_group_segment_fixed_size 0 + 
.amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 80 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 16 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_fp16_overflow 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 200 +; NumSgprs: 18 +; NumVgprs: 9 +; NumAgprs: 0 +; TotalNumVgprs: 9 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 2 +; VGPRBlocks: 2 +; NumSGPRsForWavesPerEU: 18 +; NumVGPRsForWavesPerEU: 9 +; Occupancy: 10 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .protected 
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE +_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE +_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 + + .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" + .section ".note.GNU-stack" + .addrsig + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .addrsig_sym 
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .amdgpu_metadata +--- +amdhsa.kernels: + - .args: + - .address_space: global + .offset: 0 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .offset: 16 + .size: 8 + .value_kind: by_value + - .offset: 24 + .size: 8 + .value_kind: hidden_global_offset_x + - .offset: 32 + .size: 8 + .value_kind: hidden_global_offset_y + - .offset: 40 + .size: 8 + .value_kind: hidden_global_offset_z + - .address_space: global + .offset: 48 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 56 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 72 + .size: 8 + .value_kind: hidden_multigrid_sync_arg + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 80 + .language: OpenCL C + .language_version: + - 2 + - 0 + .max_flat_workgroup_size: 1024 + .name: _Z20vector_square_kernelIfEvPT_PKS0_x + .private_segment_fixed_size: 0 + .sgpr_count: 18 + .sgpr_spill_count: 0 + .symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd + .vgpr_count: 9 + .vgpr_spill_count: 0 + .wavefront_size: 64 +amdhsa.target: amdgcn-amd-amdhsa--gfx908 +amdhsa.version: + - 1 + - 1 +... 
+ + .end_amdgpu_metadata diff --git a/HIP-Basic/assembly_to_executable/main_gfx90a.s b/HIP-Basic/assembly_to_executable/main_gfx90a.s new file mode 100644 index 00000000..85575938 --- /dev/null +++ b/HIP-Basic/assembly_to_executable/main_gfx90a.s @@ -0,0 +1,226 @@ + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx90a" + .protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x + .globl _Z20vector_square_kernelIfEvPT_PKS0_x + .p2align 8 + .type _Z20vector_square_kernelIfEvPT_PKS0_x,@function +_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x +; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dwordx2 s[12:13], s[6:7], 0x10 + v_mov_b32_e32 v1, 0 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_u32_e32 v0, s8, v0 + v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] + s_and_saveexec_b64 s[0:1], vcc + s_cbranch_execz BB0_3 +; %bb.1: + s_load_dword s14, s[4:5], 0xc + s_load_dwordx4 s[8:11], s[6:7], 0x0 + s_mov_b32 s15, 0 + v_lshlrev_b64 v[2:3], 2, v[0:1] + s_mov_b64 s[6:7], 0 + s_waitcnt lgkmcnt(0) + s_lshl_b64 s[4:5], s[14:15], 2 +BB0_2: ; =>This Inner Loop Header: Depth=1 + v_mov_b32_e32 v5, s11 + v_add_co_u32_e32 v4, vcc, s10, v2 + v_addc_co_u32_e32 v5, vcc, v5, v3, vcc + global_load_dword v6, v[4:5], off + v_mov_b32_e32 v5, s9 + v_mov_b32_e32 v7, s15 + v_add_co_u32_e32 v0, vcc, s14, v0 + v_mov_b32_e32 v8, s5 + v_add_co_u32_e64 v4, s[0:1], s8, v2 + v_add_co_u32_e64 v2, s[2:3], s4, v2 + v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] + v_addc_co_u32_e32 v1, vcc, v1, v7, vcc + v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] + v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] + s_or_b64 s[6:7], vcc, s[6:7] + s_waitcnt vmcnt(0) + v_mul_f32_e32 v6, v6, v6 + global_store_dword v[4:5], v6, off + s_andn2_b64 exec, exec, s[6:7] + s_cbranch_execnz BB0_2 +BB0_3: + s_endpgm + .section .rodata,#alloc + .p2align 6 + .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x + .amdhsa_group_segment_fixed_size 0 + 
.amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 80 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 16 + .amdhsa_accum_offset 12 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_fp16_overflow 0 + .amdhsa_tg_split 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 200 +; NumSgprs: 18 +; NumVgprs: 9 +; NumAgprs: 0 +; TotalNumVgprs: 9 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 2 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 18 +; NumVGPRsForWavesPerEU: 9 +; AccumOffset: 12 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; 
COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2 +; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0 + .text + .p2alignl 6, 3212836864 + .fill 256, 4, 3212836864 + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE +_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE +_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 + + .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object + .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc + .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE +_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: + .zero 1 + .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 + + .ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" + .section ".note.GNU-stack" + .addrsig + .addrsig_sym 
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE + .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE + .amdgpu_metadata +--- +amdhsa.kernels: + - .args: + - .address_space: global + .offset: 0 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .offset: 16 + .size: 8 + .value_kind: by_value + - .offset: 24 + .size: 8 + .value_kind: hidden_global_offset_x + - .offset: 32 + .size: 8 + .value_kind: hidden_global_offset_y + - .offset: 40 + .size: 8 + .value_kind: hidden_global_offset_z + - .address_space: global + .offset: 48 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 56 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: hidden_none + - .address_space: global + .offset: 72 + .size: 8 + .value_kind: hidden_multigrid_sync_arg + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 80 + .language: OpenCL C + .language_version: + - 2 + - 0 + .max_flat_workgroup_size: 1024 + .name: _Z20vector_square_kernelIfEvPT_PKS0_x + .private_segment_fixed_size: 0 + .sgpr_count: 18 + .sgpr_spill_count: 0 + .symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd + .vgpr_count: 9 + .vgpr_spill_count: 0 + .wavefront_size: 64 +amdhsa.target: amdgcn-amd-amdhsa--gfx90a +amdhsa.version: + - 1 + - 1 +... + + .end_amdgpu_metadata diff --git a/HIP-Basic/bandwidth/.gitignore b/HIP-Basic/bandwidth/.gitignore new file mode 100644 index 00000000..d69da8d5 --- /dev/null +++ b/HIP-Basic/bandwidth/.gitignore @@ -0,0 +1 @@ +hip_bandwidth diff --git a/HIP-Basic/bandwidth/CMakeLists.txt b/HIP-Basic/bandwidth/CMakeLists.txt new file mode 100644 index 00000000..3d319b43 --- /dev/null +++ b/HIP-Basic/bandwidth/CMakeLists.txt @@ -0,0 +1,56 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. 
All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +set(example_name hip_bandwidth) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +if(NOT CMAKE_PREFIX_PATH) + set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") +endif() + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) +set(include_dirs "../../Common") +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) diff --git a/HIP-Basic/bandwidth/Makefile b/HIP-Basic/bandwidth/Makefile new file mode 100644 index 00000000..6c821f21 --- /dev/null +++ b/HIP-Basic/bandwidth/Makefile @@ -0,0 +1,54 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := hip_bandwidth +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +CXXFLAGS := -std=$(CXX_STD) +CPPFLAGS := -I $(COMMON_INCLUDE_DIR) +LDFLAGS := +LDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + CXXFLAGS += -x cu + CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) +else +$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". 
GPU_RUNTIME must be either CUDA or HIP) +endif + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp + $(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/HIP-Basic/bandwidth/README.md b/HIP-Basic/bandwidth/README.md new file mode 100644 index 00000000..31bbba35 --- /dev/null +++ b/HIP-Basic/bandwidth/README.md @@ -0,0 +1,28 @@ +# Cookbook Bandwidth Example + +## Description +This example measures the memory bandwidth capacity of GPU devices. It performs memcpy from host to GPU device, GPU device to host, and within a single GPU. + +### Application flow +1. User commandline arguments are parsed and test parameters initialized. If there are no commandline arguments then the test parameters are initialized with default values. +2. Bandwidth tests are launched. +3. If the memory type for the test is set to `-memory pageable` then the host side data is instantiated in `std::vector`. If the memory type for the test is set to `-memory pinned` then the host side data is instantiated in `unsigned char*` and allocated using `hipHostMalloc`. +4. Device side storage is allocated using `hipMalloc` in `unsigned char*`. +5. Memory transfer is performed `trials` number of times using `hipMemcpy` for pageable memory or using `hipMemcpyAsync` for host allocated pinned memory. +6. The time taken by the memory transfer operations is measured and then used to calculate the bandwidth. +7. All device memory is freed using `hipFree` and all host allocated pinned memory is freed using `hipHostFree`. + +## Key APIs and Concepts +The program uses HIP pageable and pinned memory. It is important to note that the pinned memory is allocated using `hipHostMalloc` and is destroyed using `hipHostFree`. The HIP memory transfer routine `hipMemcpyAsync` will behave synchronously if the host memory is not pinned. Therefore, it is important to allocate pinned host memory using `hipHostMalloc` for `hipMemcpyAsync` to behave asynchronously.
+ +## Demonstrated API Calls +### HIP runtime +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyAsync` +- `hipGetDeviceCount` +- `hipGetDeviceProperties` +- `hipFree` +- `hipHostFree` +- `hipHostMalloc` +- `hipSetDevice` diff --git a/HIP-Basic/bandwidth/bandwidth_vs2019.sln b/HIP-Basic/bandwidth/bandwidth_vs2019.sln new file mode 100644 index 00000000..09016afe --- /dev/null +++ b/HIP-Basic/bandwidth/bandwidth_vs2019.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidth_vs2019", "bandwidth_vs2019.vcxproj", "{16B11B54-CD72-43B6-B226-38C668B41A79}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.ActiveCfg = Debug|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.Build.0 = Debug|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.ActiveCfg = Release|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {1E2ACB7F-1706-491A-9E62-395C1BD8E637} + EndGlobalSection +EndGlobal diff --git a/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj new file mode 100644 index 00000000..3283ff1b --- /dev/null +++ b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj @@ -0,0 +1,102 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + 15.0 + {16b11b54-cd72-43b6-b226-38c668b41a79} + Win32Proj + bandwidth_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + 
+ + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + diff --git a/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters new file mode 100644 index 00000000..7dc35f68 --- /dev/null +++ b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {c71d9db2-bf13-49ee-b794-626d24391150} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {520f4985-c9bd-4add-9485-049fafe0cdca} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {006f799a-d711-49a7-93da-7f60d8872b02} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/HIP-Basic/bandwidth/main.hip b/HIP-Basic/bandwidth/main.hip new file mode 100644 index 00000000..56d127ae --- /dev/null +++ b/HIP-Basic/bandwidth/main.hip @@ -0,0 +1,637 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "cmdparser.hpp" +#include "example_utils.hpp" + +#include + +#include +#include +#include +#include + +// Paged or pinned host memory +enum class MemoryMode : unsigned int +{ + PAGED, + PINNED +}; + +// Test either ranges of inputs sizes with a constant increament +// or a more complex shmoo test that tests bandwidth for large number of varying sizes. 
+enum class TestMode : unsigned int +{ + RANGED, + SHMOO +}; + +/// \brief Run host to device or device to host transfer, bandwidth calculated for the specified configuration +std::vector + run_bandwidth_host_device(const std::vector& memory_copy_measurement_sizes, + const int device, + hipMemcpyKind hip_memcpy_kind, + const MemoryMode memory_mode, + const unsigned int trails) +{ + + // Check for invalid configurations + if(hip_memcpy_kind == hipMemcpyDeviceToDevice) + { + std::cerr << "hipMemcpyDeviceToDevice is an invalid Configuration\n"; + exit(error_exit_code); + } + + // The bandwidths calculated will be stored in bandwidth_measurements + std::vector bandwidth_measurements; + + // Flush buffer for CPU cache + constexpr size_t flush_size = 256 * 1024 * 1024; + std::vector flush_buffer(flush_size); + + HIP_CHECK(hipSetDevice(device)); + + if(hip_memcpy_kind == hipMemcpyHostToDevice) + { + std::cout << "Measuring Host to Device Bandwidth: " << std::flush; + } + else + { + std::cout << "Measuring Device to Host Bandwidth: " << std::flush; + } + + for(auto size : memory_copy_measurement_sizes) + { + std::cout << "[" << size << "] " << std::flush; + + // Blocks used to clear host cache + const unsigned long long cache_clear_size = 1 << 24; + std::vector h_cache_block_1(cache_clear_size); + std::vector h_cache_block_2(cache_clear_size); + + // Size in bytes + const size_t size_in_bytes = sizeof(unsigned char) * size; + + // Allocate device input memory + unsigned char* d_in = nullptr; + HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); + + // Memory transfer from host to device + if(memory_mode == MemoryMode::PAGED) + { + // Host input memory + std::vector h_in(size); + + // Host output memory + std::vector h_out(size); + + // Initialize the host input memory + for(unsigned int i = 0; i < size; i++) + { + h_in[i] = static_cast(i & 0xff); + } + + unsigned char* src = nullptr; + unsigned char* dst = nullptr; + + switch(hip_memcpy_kind) + { + case hipMemcpyHostToDevice: + // 
Set the source and destination for hipMemcpy + src = h_in.data(); + dst = d_in; + break; + case hipMemcpyDeviceToHost: + // Transfer the host input to device + HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); + + // Set the source and destination for hipMemcpy + src = d_in; + dst = h_out.data(); + break; + default: + std::cerr << "Invalid memcpy kind " << hip_memcpy_kind << "! \n"; + exit(error_exit_code); + } + + // Fill the host cache clear buffers + for(unsigned int i = 0; i < h_cache_block_1.size(); i++) + { + h_cache_block_1[i] = static_cast(i & 0xff); + h_cache_block_2[i] = static_cast(0xff - (i & 0xff)); + } + + // Timer class + HostClock host_clock; + + // Perform memory transfers warm up + for(unsigned int i = 0; i < 5; i++) + { + // Initiate the memory transfer + HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hip_memcpy_kind)); + + // Flush the buffer + memset(flush_buffer.data(), i, flush_buffer.size()); + } + + // Perform memory transfers for trails number of times + for(unsigned int i = 0; i < trails; i++) + { + host_clock.start_timer(); + + // Initiate the memory transfer + HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hip_memcpy_kind)); + + host_clock.stop_timer(); + + // Flush the buffer + memset(flush_buffer.data(), i, flush_buffer.size()); + } + // Calculate the bandwith in GB/s + const double bandwidth_achieved + = ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time(); + + bandwidth_measurements.emplace_back(bandwidth_achieved); + } + else if(memory_mode == MemoryMode::PINNED) // Pinned memory mode + { + // Host input memory + unsigned char* h_in = nullptr; + + // Host output memory + unsigned char* h_out = nullptr; + + HIP_CHECK(hipHostMalloc(&h_in, size_in_bytes)); + HIP_CHECK(hipHostMalloc(&h_out, size_in_bytes)); + + // Initialize the host memory + for(unsigned int i = 0; i < size; i++) + { + h_in[i] = static_cast(i & 0xff); + } + + unsigned char* src = nullptr; + unsigned char* dst = nullptr; + + 
if(hip_memcpy_kind == hipMemcpyHostToDevice) + { + // Set the source and destination for hipMemcpy + src = h_in; + dst = d_in; + } + else if(hip_memcpy_kind == hipMemcpyDeviceToHost) + { + // Transfer the host input to device + HIP_CHECK(hipMemcpyAsync(d_in, h_in, size_in_bytes, hip_memcpy_kind)); + HIP_CHECK(hipDeviceSynchronize()); + + // Set the source and destination for hipMemcpy + src = d_in; + dst = h_out; + } + + // Perform memory transfers warm up + for(unsigned int i = 0; i < 5; i++) + { + HIP_CHECK(hipMemcpyAsync(dst, src, size_in_bytes, hip_memcpy_kind)); + } + HIP_CHECK(hipDeviceSynchronize()); + + HostClock host_clock; + host_clock.start_timer(); + + // Initiate the memory transfer + // Perform memory transfers for trails number of times + for(unsigned int i = 0; i < trails; i++) + { + HIP_CHECK(hipMemcpyAsync(dst, src, size_in_bytes, hip_memcpy_kind)); + } + + HIP_CHECK(hipDeviceSynchronize()); + + host_clock.stop_timer(); + + // Calculate the bandwith in GB/s + const double bandwidth_achieved + = ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time(); + + bandwidth_measurements.emplace_back(bandwidth_achieved); + + HIP_CHECK(hipHostFree(h_in)); + HIP_CHECK(hipHostFree(h_out)); + } + + // Free the memory + HIP_CHECK(hipFree(d_in)); + } + std::cout << std::endl; + + return bandwidth_measurements; +} + +/// \brief Run device to device transfer, bandwidth calculated for the specified configuration +std::vector + run_bandwidth_device_device(std::vector memory_copy_measurement_sizes, + const int device, + const unsigned int trails) +{ + + // The bandwidths calculated will be stored in bandwidth_measurements + std::vector bandwidth_measurements; + + HIP_CHECK(hipSetDevice(device)); + + std::cout << "Measuring Device to Device Bandwith: " << std::flush; + for(auto size : memory_copy_measurement_sizes) + { + std::cout << "[" << size << "] " << std::flush; + + // Size in bytes + const size_t size_in_bytes = sizeof(unsigned char) * size; + + // 
Allocate device input memory + unsigned char* d_in = nullptr; + HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); + + // Device output memory. + unsigned char* d_out = nullptr; + HIP_CHECK(hipMalloc(&d_out, size_in_bytes)); + + // Host input memory + std::vector h_in(size); + + // Initialize the host input memory + for(unsigned int i = 0; i < size; i++) + { + h_in[i] = static_cast(i & 0xff); + } + + // Transfer the host input to device + HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); + + // Set the source and destination for hipMemcpy + unsigned char* src = d_in; + unsigned char* dst = d_out; + + // Perform memory transfers warm up + for(unsigned int i = 0; i < 5; i++) + { + // Initiate the memory transfer + HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hipMemcpyDeviceToDevice)); + } + + // Synchronize because the device to device memory copy is non-blocking + HIP_CHECK(hipDeviceSynchronize()); + + // Timer class + HostClock host_clock; + host_clock.start_timer(); + + // Perform memory transfers for trails number of times + for(unsigned int i = 0; i < trails; i++) + { + // Initiate the memory transfer + HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hipMemcpyDeviceToDevice)); + } + HIP_CHECK(hipDeviceSynchronize()); + + host_clock.stop_timer(); + + // Calculate the bandwith in GB/s + const double bandwidth_achieved + = ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time(); + + bandwidth_measurements.emplace_back(bandwidth_achieved); + + // Free the device output memory + HIP_CHECK(hipFree(d_out)); + + // Free the memory + HIP_CHECK(hipFree(d_in)); + } + std::cout << std::endl; + + return bandwidth_measurements; +} + +std::vector + generate_measurement_sizes_range(const size_t start_measurement, + const size_t end_measurement, + const size_t stride_between_measurements) +{ + // The size of data to copy for each measurement + std::vector memory_copy_measurement_sizes; + + for(size_t i = start_measurement; i < end_measurement; i += 
stride_between_measurements) + { + memory_copy_measurement_sizes.emplace_back(i); + } + + return memory_copy_measurement_sizes; +} + +std::vector generate_measurement_sizes_shmoo() +{ + + // Constants for shmoo mode + const size_t shmoo_memsize_max = 1 << 26; // 64 MB + + const size_t shmoo_increment_1KB = 1 << 10; // 1 KB + const size_t shmoo_increment_2KB = 1 << 11; // 2 KB + const size_t shmoo_increment_10KB = shmoo_increment_1KB * 10; // 10KB + const size_t shmoo_increment_100KB = shmoo_increment_10KB * 10; // 100 KB + const size_t shmoo_increment_1MB = 1 << 20; // 1 MB + const size_t shmoo_increment_2MB = 1 << 21; // 2 MB + const size_t shmoo_increment_4MB = 1 << 22; // 4 MB + + const size_t shmoo_limit_20KB = shmoo_increment_10KB * 2; // 20 KB + const size_t shmoo_limit_50KB = shmoo_increment_10KB * 5; // 50 KB + const size_t shmoo_limit_100KB = shmoo_increment_10KB * 10; // 100 KB + const size_t shmoo_limit_1MB = 1 << 20; // 1 MB + const size_t shmoo_limit_16MB = 1 << 24; // 16 MB + const size_t shmoo_limit_32MB = 1 << 25; // 32 MB + + // The size of data to copy for each measurement + std::vector memory_copy_measurement_sizes; + + size_t current_size = 0; + + while(current_size <= shmoo_memsize_max) + { + if(current_size < shmoo_limit_20KB) + { + current_size += shmoo_increment_1KB; + } + else if(current_size < shmoo_limit_50KB) + { + current_size += shmoo_increment_2KB; + } + else if(current_size < shmoo_limit_100KB) + { + current_size += shmoo_increment_10KB; + } + else if(current_size < shmoo_limit_1MB) + { + current_size += shmoo_increment_100KB; + } + else if(current_size < shmoo_limit_16MB) + { + current_size += shmoo_increment_1MB; + } + else if(current_size < shmoo_limit_32MB) + { + current_size += shmoo_increment_2MB; + } + else + { + current_size += shmoo_increment_4MB; + } + memory_copy_measurement_sizes.emplace_back(current_size); + } + + return memory_copy_measurement_sizes; +} + +void configure_parser(cli::Parser& parser) +{ + // Default 
parameters + parser.set_optional("start", "start", 1 << 20, "Starting size"); // Default 1 MB + parser.set_optional("end", "end", 1 << 23, "Ending size"); // Default 8 MB + parser.set_optional("stride", + "stride", + 1 << 22, // Default 4 MB + "Stride (or increament) between sizes"); + + parser.set_optional("mode", + "mode", + "range", + "Mode of bandwidth test: range or shmoo"); + parser.set_optional("memory", + "memory", + "pageable", + "Memory allocation kind: pageable or pinned\n"); + parser.set_optional("trials", "trials", 50, "Number of trials"); + parser.set_optional>( + "device", + "device", + {"0"}, + "Space-separated list of devices\n" + "\tall for using all the available devices\n" + "\t0,1,2,...,n for using any particular available devices"); + parser.set_optional>("memcpy", + "memcpy", + {"htod", "dtoh", "dtod"}, + "Space-separated list of memory copy kind.\n" + "\thtod is host to device\n" + "\tdtoh is device to host\n" + "\tdtod is device to device"); +} + +int main(int argc, char** argv) +{ + + // Get the number of hip devices in the system + int number_of_devices = 0; + HIP_CHECK(hipGetDeviceCount(&number_of_devices)) + + if(number_of_devices <= 0) + { + std::cerr << "HIP supported devices not found!" 
+ << "\n"; + exit(error_exit_code); + } + + // Parse user inputs + cli::Parser parser(argc, argv); + configure_parser(parser); + parser.run_and_exit_if_error(); + + // Set configurations for testing bandwidth + const size_t trials = parser.get("trials"); + const size_t start_measurement = parser.get("start"); + const size_t end_measurement = parser.get("end"); + const size_t stride_between_measurements = parser.get("stride"); + const std::string mode = parser.get("mode"); + const std::string memory_cmd = parser.get("memory"); + const std::vector devices_cmd = parser.get>("device"); + const std::vector memcpy_cmd = parser.get>("memcpy"); + + // Set the mode of bandwidth test: RANGED or SHMOO + TestMode mode_of_test; + + if(mode == "range") + { + mode_of_test = TestMode::RANGED; + } + else if(mode == "shmoo") + { + mode_of_test = TestMode::SHMOO; + } + else + { + std::cerr << "Invalid mode " << mode << "! \n"; + exit(error_exit_code); + } + + // Set the memory host allocation type: PAGED or PINNED + MemoryMode memory_allocation; + if(memory_cmd == "pageable") + { + memory_allocation = MemoryMode::PAGED; + } + else if(memory_cmd == "pinned") + { + memory_allocation = MemoryMode::PINNED; + } + else + { + std::cerr << "Invalid memory allocation " << memory_cmd << "! 
\n"; + exit(error_exit_code); + } + + // Store device ids + std::vector devices; + if(std::find(devices_cmd.begin(), devices_cmd.end(), "all") != devices_cmd.end()) + { + devices = std::vector(number_of_devices); + + // Initialize the default device ids + std::iota(devices.begin(), devices.end(), 0); + } + else + { + for(const std::string& device : devices_cmd) + { + int device_id; + if(!parse_int_string(device, device_id)) + { + std::cerr << "Invalid device ID " << device << "!\n"; + exit(error_exit_code); + } + + if(device_id < 0 || device_id >= number_of_devices) + { + std::cerr << "Invalid device id " << device << "!\n" + << "Device does not exist\n"; + exit(error_exit_code); + } + devices.emplace_back(device_id); + } + } + + std::cout << "Devices: " << format_range(devices.begin(), devices.end()) << "\n"; + + // Set hipMemcpyKind + std::map memcpy_kinds; + if(std::find(memcpy_cmd.begin(), memcpy_cmd.end(), "all") != memcpy_cmd.end()) + { + memcpy_kinds.insert({hipMemcpyHostToDevice, "Host to Device"}); + memcpy_kinds.insert({hipMemcpyDeviceToHost, "Device to Host"}); + memcpy_kinds.insert({hipMemcpyDeviceToDevice, "Device to Device"}); + } + else + { + for(std::string memcpy : memcpy_cmd) + { + if(memcpy == "htod") + { + memcpy_kinds.insert({hipMemcpyHostToDevice, "Host to Device"}); + } + else if(memcpy == "dtoh") + { + memcpy_kinds.insert({hipMemcpyDeviceToHost, "Device to Host"}); + } + else if(memcpy == "dtod") + { + memcpy_kinds.insert({hipMemcpyDeviceToDevice, "Device to Device"}); + } + else + { + std::cerr << "Invalid memcpy!" 
+ << "\n"; + exit(error_exit_code); + } + } + } + + std::vector memory_copy_measurement_sizes; + if(mode_of_test == TestMode::RANGED) + { + memory_copy_measurement_sizes + = generate_measurement_sizes_range(start_measurement, + end_measurement, + stride_between_measurements); + } + else + { + memory_copy_measurement_sizes = generate_measurement_sizes_shmoo(); + } + + std::cout << "Measurement Sizes: " + << format_range(memory_copy_measurement_sizes.begin(), + memory_copy_measurement_sizes.end()) + << "\n\n"; + + // Run the bandwidth tests on devices + for(auto device : devices) + { + hipDeviceProp_t devProp; + HIP_CHECK(hipSetDevice(device)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device)); + + for(auto memcpy_kind : memcpy_kinds) + { + std::string print_text; + if(memory_allocation == MemoryMode::PAGED) + { + print_text = "Paged Bandwidth "; + } + else if(memory_allocation == MemoryMode::PINNED) + { + print_text = "Pinned Bandwidth "; + } + if(memcpy_kind.first == hipMemcpyDeviceToDevice) + { + print_text = "Bandwidth "; + } + + std::vector bandwidth_measurements; + if(memcpy_kind.first == hipMemcpyDeviceToDevice) + { + bandwidth_measurements + = run_bandwidth_device_device(memory_copy_measurement_sizes, device, trials); + } + else + { + bandwidth_measurements = run_bandwidth_host_device(memory_copy_measurement_sizes, + device, + memcpy_kind.first, + memory_allocation, + trials); + } + std::cout << "\nDevice ID [" << device << "] Device Name [" << devProp.name + << "]: " << print_text << memcpy_kind.second << " (GB/s): " + << format_range(bandwidth_measurements.begin(), bandwidth_measurements.end()) + << "\n\n"; + } + } +} diff --git a/HIP-Basic/device_query/device_query_vs2019.sln b/HIP-Basic/device_query/device_query_vs2019.sln index 9297291b..7dc7482a 100644 --- a/HIP-Basic/device_query/device_query_vs2019.sln +++ b/HIP-Basic/device_query/device_query_vs2019.sln @@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 
Version 16 VisualStudioVersion = 16.0.32630.194 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_template_vs2019", "example_template_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "device_query_vs2019", "device_query_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/HIP-Basic/device_query/device_query_vs2019.vcxproj b/HIP-Basic/device_query/device_query_vs2019.vcxproj index 980cf88c..841fe5b5 100644 --- a/HIP-Basic/device_query/device_query_vs2019.vcxproj +++ b/HIP-Basic/device_query/device_query_vs2019.vcxproj @@ -20,7 +20,7 @@ 15.0 {C2C6E811-57E3-44C5-9AB9-195D60A1638C} Win32Proj - example_template_vs2019 + device_query_vs2019 10.0 @@ -52,9 +52,11 @@ true + hip_$(ProjectName) false + hip_$(ProjectName) gfx1030 @@ -94,4 +96,4 @@ - \ No newline at end of file + diff --git a/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj b/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj index 6cfab55e..659e43e1 100644 --- a/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj +++ b/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj @@ -52,9 +52,11 @@ true + hip_$(ProjectName) false + hip_$(ProjectName) gfx1030 diff --git a/HIP-Basic/events/README.md b/HIP-Basic/events/README.md index 4777662f..ff9bd0f9 100644 --- a/HIP-Basic/events/README.md +++ b/HIP-Basic/events/README.md @@ -1,6 +1,6 @@ # HIP-Basic Events Example ## Description -Memory transfer and kernel execution are the most important parameter in parallel computing (especially HPC and machine learning). Memory bottlenecks are the main problem why we are not able to get the highest performance, therefore obtaining the memory transfer timing and kernel execution timing plays key role in application optimization. 
+Memory transfer and kernel execution are the most important parameters in parallel computing, especially in high performance computing (HPC) and machine learning. Memory bottlenecks are the main problem why we are not able to get the highest performance, therefore obtaining the memory transfer timing and kernel execution timing plays key role in application optimization. This example showcases measuring kernel and memory transfer timing using HIP events. The kernel under measurement is a trivial one that performs square matrix transposition. @@ -8,11 +8,14 @@ This example showcases measuring kernel and memory transfer timing using HIP eve 1. A number of parameters are defined that control the problem details and the kernel launch. 2. Input data is set up in host memory. 3. The necessary amount of device memory is allocated. -4. A pair of `hipEvent` objects are defined and initialized. Time measurement is started on the `start` event. -5. Memory transfer from host to device of the input data is performed, and the measurement is stopped using the `stop` event. The execution time is calculated via the `start` and `stop` events and it is printed to the standard output. -6. The kernel is launched, and its runtime is measured similarly using the `start` and `stop` events. -7. The result data is copied back to the host, and the execution time of the copy is measured similarly. -8. The result data is validated by comparing it to the product of the reference (host) implementation. The result of the validation is printed to the standard output. +4. A pair of `hipEvent` objects are defined and initialized. +5. Time measurement is started on the `start` event. +6. Memory transfer from host to device of the input data is performed. +7. The time measurement is stopped using the `stop` event. The execution time is calculated via the `start` and `stop` events and it is printed to the standard output. +8. 
The kernel is launched, and its runtime is measured similarly using the `start` and `stop` events. +9. The result data is copied back to the host, and the execution time of the copy is measured similarly. +10. The allocated device memory is freed and the event objects are released. +11. The result data is validated by comparing it to the product of the reference (host) implementation. The result of the validation is printed to the standard output. ## Key APIs and Concepts - The `hipEvent_t` type defines HIP events that can be used for synchronization and time measurement. The events must be initialized using `hipEventCreate` before usage and destroyed using `hipEventDestroy` after they are no longer needed. diff --git a/HIP-Basic/events/main.hip b/HIP-Basic/events/main.hip index 8747c533..5549c5d0 100644 --- a/HIP-Basic/events/main.hip +++ b/HIP-Basic/events/main.hip @@ -27,7 +27,7 @@ #include #include -#include +#include /// \brief Performs a simple matrix transpose on the GPU. __global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width) @@ -154,10 +154,12 @@ int main() const auto ref_transposed_matrix = matrix_transpose_reference(h_matrix, width); // Check the results' validity. - constexpr double eps = 1.0E-6; - unsigned int errors{}; + constexpr float eps = 1.0E-6F; + unsigned int errors = 0; for(unsigned int i = 0; i < size; i++) { + // Most likely the values are bitwise equal, since they were plainly copied, + // however it is a good practice to compare floating point values using an epsilon. 
if(std::abs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps) { errors++; diff --git a/HIP-Basic/hipify/main.cu b/HIP-Basic/hipify/main.cu index 1f30febd..a1f0abf5 100644 --- a/HIP-Basic/hipify/main.cu +++ b/HIP-Basic/hipify/main.cu @@ -89,7 +89,7 @@ int main() // Copy the input from host to the GPU device CHECK(cudaMemcpy(d_in, h_in.data(), size_in_bytes, cudaMemcpyHostToDevice)); - // Set the the number of blocks per kernel grid. + // Set the number of blocks per kernel grid. constexpr unsigned int grid_size = 512; // Set the number of threads per kernel block. constexpr unsigned int threads_per_block = 256; diff --git a/HIP-Basic/llvm_ir_to_executable/.gitignore b/HIP-Basic/llvm_ir_to_executable/.gitignore new file mode 100644 index 00000000..85472506 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/.gitignore @@ -0,0 +1,4 @@ +hip_llvm_ir_to_executable +*.bc +*.o +*.hipfb diff --git a/HIP-Basic/llvm_ir_to_executable/CMakeLists.txt b/HIP-Basic/llvm_ir_to_executable/CMakeLists.txt new file mode 100644 index 00000000..9796dca6 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/CMakeLists.txt @@ -0,0 +1,174 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name hip_llvm_ir_to_executable) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") + +# Only supported on HIP (not CUDA) +if(NOT "${GPU_RUNTIME}" STREQUAL "HIP") + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +if(NOT CMAKE_PREFIX_PATH) + set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") +endif() + +if (NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(GPU_ARCHITECTURES "all" CACHE STRING "GPU architectures to compile for") +else() + set(GPU_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "GPU architectures to compile for") +endif() + +if(GPU_ARCHITECTURES STREQUAL "all") + set(GPU_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "GPU architectures to compile for" FORCE) +endif() + +# Remove duplicates +list(REMOVE_DUPLICATES GPU_ARCHITECTURES) +message(STATUS "GPU_ARCHITECTURES: ${GPU_ARCHITECTURES}") + +set_source_files_properties(main.hip PROPERTIES COMPILE_OPTIONS "--cuda-host-only") + +if (WIN32) + set(OBJ_TYPE obj) + set(NULDEV NUL) + set(HOST_TARGET x86_64-pc-windows-msvc) + set(HIP_OBJ_GEN_FILE 
hip_obj_gen_win.mcin) +else() + set(OBJ_TYPE o) + set(NULDEV /dev/null) + set(HOST_TARGET x86_64-unknown-linux) + set(HIP_OBJ_GEN_FILE hip_obj_gen.mcin) +endif() + +# Assemble the device LLVM IR files to object files using the HIP compiler. +# The compiler needs -target amdgcn-amd-amdhsa -mcpu=gfx* in order to assemble the object file +# for the right GPU. +foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES}) + message(STATUS "Generating main_${HIP_ARCHITECTURE}.${OBJ_TYPE}") + add_custom_command( + OUTPUT main_${HIP_ARCHITECTURE}.${OBJ_TYPE} + COMMAND ${CMAKE_HIP_COMPILER} -fPIC -target amdgcn-amd-amdhsa -mcpu=${HIP_ARCHITECTURE} + ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.ll + -o ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.ll + VERBATIM) +endforeach() + +# Create an offload-bundle from the assembled object files. This needs the clang-offload-bundler tool. +find_program( + OFFLOAD_BUNDLER_COMMAND clang-offload-bundler + PATH_SUFFIXES bin + PATHS + ${ROCM_ROOT}/llvm + ${CMAKE_INSTALL_PREFIX}/llvm + REQUIRED) + +if(OFFLOAD_BUNDLER_COMMAND) + message(STATUS "clang-offload-bundler found: ${OFFLOAD_BUNDLER_COMMAND}") +else() + message(FATAL_ERROR "clang-offload-bundler not found") +endif() + +# Generate object bundle. +# The invocation to generate is +# clang-offload-bundler -targets= -input= -inputs= ... -output= +# Note that the host target must be the first target present here, and it should have an empty input associated to it. + +# Generate BUNDLE_TARGETS as a string of: -targets=host-${HOST_TARGET},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE},... +set(BUNDLE_TARGETS "-targets=host-${HOST_TARGET}") +# Generate BUNDLE_INPUTS as a string of: -input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} ...
+set(BUNDLE_INPUTS "-input=${NULDEV}") +# Generate BUNDLE_OBJECTS as a string of: ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} +set(BUNDLE_OBJECTS "") +foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES}) + set(BUNDLE_TARGETS "${BUNDLE_TARGETS},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE}") + list(APPEND BUNDLE_INPUTS "-input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}") + list(APPEND BUNDLE_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}") +endforeach() + +# Invoke clang-offload-bundler to generate an offload bundle. +set(BUNDLE "${CMAKE_CURRENT_BINARY_DIR}/offload_bundle.hipfb") +add_custom_command( + OUTPUT "${BUNDLE}" + COMMAND + "${OFFLOAD_BUNDLER_COMMAND}" + -type=o + -bundle-align=4096 + "${BUNDLE_TARGETS}" + ${BUNDLE_INPUTS} + "-output=${BUNDLE}" + DEPENDS ${BUNDLE_OBJECTS} + VERBATIM) + +# Create the device binary by assembling the template that includes +# the offload bundle that was just generated using an .incbin directive. +# This needs an assembler. +find_program( + LLVM_MC_COMMAND llvm-mc + PATH_SUFFIXES bin + PATHS + ${ROCM_ROOT}/llvm + ${CMAKE_INSTALL_PREFIX}/llvm) + +if(LLVM_MC_COMMAND) + message(STATUS "llvm-mc found: ${LLVM_MC_COMMAND}") +else() + message(FATAL_ERROR "llvm-mc not found") +endif() + +# Invoke llvm-mc to generate an object file containing the offload bundle. +set(DEVICE_OBJECT "${CMAKE_CURRENT_BINARY_DIR}/main_device.${OBJ_TYPE}") +add_custom_command( + OUTPUT "${DEVICE_OBJECT}" + COMMAND + "${LLVM_MC_COMMAND}" + -triple "${HOST_TARGET}" + "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}" + -o "${DEVICE_OBJECT}" + --filetype=obj + DEPENDS "${BUNDLE}" + VERBATIM) + +# Finally, create the executable. 
+add_executable( + ${example_name} + main.hip + ${DEVICE_OBJECT}) + +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) diff --git a/HIP-Basic/llvm_ir_to_executable/Makefile b/HIP-Basic/llvm_ir_to_executable/Makefile new file mode 100644 index 00000000..5fb8b2d0 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/Makefile @@ -0,0 +1,90 @@ +# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +EXAMPLE := hip_llvm_ir_to_executable +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME ?= HIP + +ifneq ($(GPU_RUNTIME), HIP) +$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.) 
+endif + + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc +CLANG ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang +LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc +CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler + +# Common variables and flags +CXX_STD := c++17 +CXXFLAGS := -std=$(CXX_STD) +CPPFLAGS := -I $(COMMON_INCLUDE_DIR) +LDFLAGS := +LDLIBS := + +# Compile for these GPU architectures +HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030 + +# If white-space is given as a literal the `subst` cannot recognize it. +# There this `empty` `space` hack is used in the tokenizing of GPU_TARGETS +# and the creation of GPU_ARCH_TRIPLES, which is later passed to CLANG_OFFLOAD_BUNDLER +# in the targets option. The targets option needs to be a single string with no spaces. +empty = +space = $(empty) $(empty) +comma = , + +GPU_ARCHS := $(subst ;,$(space),$(HIP_ARCHITECTURES)) +GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amdhsa--%)) + +all: $(EXAMPLE) + +$(EXAMPLE): main.o main_device.o + $(HIPCXX) -o $@ $^ + +main_device.o: hip_obj_gen.mcin offload_bundle.hipfb + $(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj + +offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o) + $(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 \ + -targets=host-x86_64-unknown-linux,$(GPU_ARCH_TRIPLES) \ + -input=/dev/null \ + $(^:%=-input=%) \ + -output=$@ + +main.o: main.hip + $(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only $< + +main_%.o: main_%.ll + $(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $< + +clean: + rm -f \ + main_device*.o \ + main_*.bc \ + offload_bundle.hipfb \ + main_device.o \ + main.o \ + $(EXAMPLE) + +.PHONY: clean $(EXAMPLE) diff --git a/HIP-Basic/llvm_ir_to_executable/README.md b/HIP-Basic/llvm_ir_to_executable/README.md new file mode 100644 index 00000000..5189bc80 --- /dev/null +++ 
b/HIP-Basic/llvm_ir_to_executable/README.md @@ -0,0 +1,117 @@ +# HIP-Basic LLVM-IR to Executable Example + +## Description +This example shows how to manually compile and link a HIP application from device LLVM IR. Pre-generated LLVM-IR files are compiled into an _offload bundle_, a bundle of device object files, and then linked with the host object file to produce the final executable. + +LLVM IR is the intermediate representation used by the LLVM compiler, which hipcc is built on. Building HIP executables from LLVM IR can be useful, for example, to experiment with specific LLVM instructions, or to help debug miscompilations. + +### Building + +- Build with Makefile: to compile for specific GPU architectures, optionally provide the HIP_ARCHITECTURES variable. Provide the architectures separated by semicolons. + ```shell + make HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" + ``` +- Build with CMake: + ```shell + cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" + cmake --build build + ``` + On Windows the path to RC compiler may be needed: `-DCMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/path/to/x64/rc.exe"` + +## Generating device LLVM IR +In this example, a HIP executable is compiled from device LLVM IR code. LLVM IR can be written completely manually, but in this example it is generated from `main.hip`, using the following commands: +```shell +$ROCM_INSTALL_DIR/bin/hipcc --cuda-device-only -c -emit-llvm ./main.hip --offload-arch=<arch> -o main_<arch>.bc -I ../../Common +$ROCM_INSTALL_DIR/bin/llvm-dis main_<arch>.bc -o main_<arch>.ll +``` +Where `<arch>` is the architecture to generate the LLVM IR for. Note that the `--cuda-device-only` flag is required to instruct `hipcc` to only generate LLVM IR for the device part of the computation, and `-c` is required to prevent the compiler from linking the outputs into an executable.
In the case of this example, the LLVM IR files were generated using architectures `gfx803`, `gfx900`, `gfx906`, `gfx908`, `gfx90a`, `gfx1030`. The user may modify the `--offload-arch` flag to build for other architectures and choose to either enable or disable extra device code-generation features such as `xnack` or `sram-ecc`, which can be specified as `--offload-arch=<arch>:<feature>+` to enable it or `--offload-arch=<arch>:<feature>-` to disable it. Multiple features may be present, separated by colons. + +The first of these two commands generates a _bitcode_ module: this is a binary encoded version of LLVM IR. The second command uses `llvm-dis` to disassemble the bitcode module into textual LLVM IR. + +## Build Process +A HIP binary consists of a regular host executable, which has an offload bundle containing device code embedded inside it. This offload bundle contains object files for each of the target devices that it is compiled for, and is loaded at runtime to provide the machine code for the current device. A HIP executable can be built from device LLVM IR and host HIP code according to the following process: + +1. The `main.hip` file is compiled with `hipcc` to an object file that only contains host code by using the `--cuda-host-only` option. `main.hip` is a program that launches a simple kernel to compute the square of each element of a vector. The `-c` option is required to prevent the compiler from creating an executable, and make it create an object file containing the compiled host code instead. + ```shell + $ROCM_INSTALL_DIR/bin/hipcc -c --cuda-host-only main.hip + ``` + +2. Each LLVM IR file is assembled to a device object file using `clang`.
This requires specifying the correct architecture using `-target amdgcn-amd-amdhsa`, and the target architecture that should be assembled for using `-mcpu`: + + ```shell + $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx1030 main_gfx1030.ll -o main_gfx1030.o + $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu= main_.ll -o main_.o + ... + ``` + +3. The device object files are combined into an offload bundle using `clang-offload-bundler`. This requires specifying the target as well as the offload kind for each device, in the form `--`. For HIP device code, `` is `hipv4`. Note that this command requires an (empty) entry for the host to also be present, with `` `host`. The order of targets and inputs must match. `` is an LLVM target triple, which is specified as `---`. `` is left empty for AMD targets. + + ```shell + $ROCM_INSTALL_DIR/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 \ + -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx1030,hipv4-... \ + -input=/dev/null \ + -input=main_gfx1030.o -input=... \ + -output=offload_bundle.hipfb + ``` + + Note: using -bundle-align=4096 only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same `--offload-arch` as when compiling the source to LLVM bitcode. + +4. The offload bundle is embedded inside an object file that can be linked with the object file containing the host code. The offload bundle must be placed in the `.hip_fatbin` section, and must be placed after the symbol `__hip_fatbin`. This can be done by creating an assembly file that places the offload bundle in the appropriate section using the `.incbin` directive: + ```nasm + .type __hip_fatbin,@object + ; Tell the assembler to place the offload bundle in the appropriate section. 
+ .section .hip_fatbin,"a",@progbits + ; Make the symbol that addresses the binary public + .globl __hip_fatbin + ; Give the bundle the required alignment + .p2align 12 + __hip_fatbin: + ; Include the binary + .incbin "offload_bundle.hipfb" + ``` + This file can then be assembled using `llvm-mc` as follows: + ```shell + $ROCM_INSTALL_DIR/llvm/bin/llvm-mc -triple -o main_device.o hip_obj_gen.mcin --filetype=obj + ``` + +5. Finally, using the system linker, `hipcc`, or `clang`, the host object and device objects are linked into an executable: + ```shell + /hip/bin/hipcc -o hip_llvm_ir_to_executable main.o main_device.o + ``` + +### Visual Studio 2019 +The above compilation steps are implemented in Visual Studio through Custom Build Steps and Custom Build Tools: +- The host compilation from step 1 is performed by adding extra options to the source file, under `main.hip -> properties -> C/C++ -> Command Line`: + ``` + Additional Options: --cuda-host-only + ``` +- Each device LLVM IR .ll file has a custom build tool associated to it, which performs the operation associated to step 2 from the previous section: + ``` + Command Line: "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a + Description: Compiling Device Assembly %(Identity) + Output: $(IntDir)%(FileName).o + Execute Before: ClCompile + ``` +- Steps 3 and 4 are implemented using a custom build step: + ``` + Command Line: + "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" + 
cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj + Description: Generating Device Offload Object + Outputs: $(IntDIr)main_device.obj + Additional Dependencies: $(IntDir)main_gfx90a.o;$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_objgen_win.mcin;%(Inputs) + Execute Before: ClCompile + ``` +- Finally step 5 is implemented by passing additional inputs to the linker in `project -> properties -> Linker -> Input`: + ``` + Additional Dependencies: $(IntDir)main_device.obj;%(AdditionalDependencies) + ``` + +## Used API surface +### HIP runtime +- `hipFree` +- `hipGetDeviceProperties` +- `hipGetLastError` +- `hipLaunchKernelGGL` +- `hipMalloc` +- `hipMemcpy` diff --git a/HIP-Basic/llvm_ir_to_executable/hip_obj_gen.mcin b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen.mcin new file mode 100644 index 00000000..6b9fee5f --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen.mcin @@ -0,0 +1,21 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# HIP Object Generator +# Use this generator to create a host bundled object file +# with the input of an offload bundled fat binary. +# +# Input: Bundled Object file .hipfb file +# Output: Host Bundled Object File .o + + .type __hip_fatbin,@object + # Tell the assembler to place the offload bundle in the appropriate section. + .section .hip_fatbin,"a",@progbits + # Make the symbol that addresses the binary public. + .globl __hip_fatbin + # Give the bundle the required alignment of 4096 (2 ^ 12). + .p2align 12 +__hip_fatbin: + # Include the offload bundle. 
+ .incbin "offload_bundle.hipfb" diff --git a/HIP-Basic/llvm_ir_to_executable/hip_obj_gen_win.mcin b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen_win.mcin new file mode 100644 index 00000000..3636354e --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen_win.mcin @@ -0,0 +1,20 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# HIP Object Generator +# Use this generator to create a host bundled object file +# with the input of an offload bundled fat binary. +# +# Input: Bundled Object file .hipfb file +# Output: Host Bundled Object File .o + + # Tell the assembler to place the offload bundle in the appropriate section. + .section .hip_fatbin,"dw" + # Make the symbol that addresses the binary public. + .globl __hip_fatbin + # Give the bundle the required alignment of 4096 (2 ^ 12). + .p2align 12 +__hip_fatbin: + # Include the offload bundle. 
+ .incbin "offload_bundle.hipfb" diff --git a/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln new file mode 100644 index 00000000..a53dc2ec --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llvm_ir_to_executable_vs2019", "llvm_ir_to_executable_vs2019.vcxproj", "{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.ActiveCfg = Debug|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.Build.0 = Debug|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.ActiveCfg = Release|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {0A13532C-E06B-4427-9847-54070C1E8622} + EndGlobalSection +EndGlobal diff --git a/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj new file mode 100644 index 00000000..c0e820b4 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + --cuda-host-only + --cuda-host-only + + + + + + + + Document + copy %(Identity) "$(IntDir)%(Identity)" + Copying %(Identity) + $(IntDir)%(Identity) + copy %(Identity) "$(IntDir)%(Identity)" + Copying %(Identity) 
+ $(IntDir)%(Identity) + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908 + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908 + + + Document + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a + + + + 15.0 + {dbb8dfe9-cb1b-473c-937c-2a8120e0d819} + Win32Proj + llvm_ir_to_executable_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + ClCompile + + + false + hip_$(ProjectName) + ClCompile + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true +
$(IntDir)main_device.obj;%(AdditionalDependencies) + + + Compiling Device LLVM IR %(Identity) + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa + $(IntDir)%(FileName).o + + + "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" +cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj + + + Generating Device Offload Object + + + $(IntDIr)main_device.obj + + + $(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs) + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + true + true + $(IntDir)main_device.obj;%(AdditionalDependencies) + + + Compiling Device LLVM IR %(Identity) + "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa + $(IntDir)%(FileName).o + + + "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=NUL "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o"
"-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" +cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj + + + Generating Device Offload Object + + + $(IntDIr)main_device.obj + + + $(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_objgen_win.mcin;%(Inputs) + + + + + + + diff --git a/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters new file mode 100644 index 00000000..25c408b7 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters @@ -0,0 +1,53 @@ + + + + + {4f2a1544-a556-4afb-b630-36ba54c0ab4a} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {b93521e0-9944-411a-9f6e-4071af6bc7ea} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {972f07c3-b925-4516-bd65-2d5a3f626888} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + diff --git a/HIP-Basic/llvm_ir_to_executable/main.hip b/HIP-Basic/llvm_ir_to_executable/main.hip new file mode 100644 index 00000000..588fc070 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main.hip @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "example_utils.hpp" + +#include + +#include +#include +#include + +/// \brief Device function to square each element +/// in the array `in` and write to array `out`. 
+template +__global__ void vector_square_kernel(T* out, const T* in, const long long size) +{ + // Global index of this thread: block index times block size plus the index within the block + const size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + // Each thread advances by the total number of threads in the grid to find the next + // element to square + const size_t stride = blockDim.x * gridDim.x; + + for(size_t i = offset; i < size; i += stride) + { + out[i] = in[i] * in[i]; + } +} + +int main() +{ + // Host program: square a float array on the GPU, then verify it. First, set the problem size + constexpr size_t size = 1000000; + constexpr size_t size_in_bytes = size * sizeof(float); + + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0 /*deviceID*/)); + std::cout << "info: running on device " << props.name << "\n"; + + std::cout << "info: allocate host mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB) " + << "\n"; + + // Declare the host side arrays + std::vector h_in(size); + std::vector h_out(size); + + // Initialize the host side input + for(size_t i = 0; i < size; i++) + { + h_in[i] = 1.618f + i; + } + + // Declare the device side arrays + float *d_in, *d_out; + std::cout << "info: allocate device mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB)\n"; + // Allocate the device side memory + HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); + HIP_CHECK(hipMalloc(&d_out, size_in_bytes)); + + std::cout << "info: copy Host2Device\n"; + + // Copy the input from host to the GPU device + HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); + + // Set the number of blocks per kernel grid. + constexpr unsigned int grid_size = 512; + // Set the number of threads per kernel block. + constexpr unsigned int threads_per_block = 256; + + std::cout << "info: launch 'vector_square_kernel' kernel\n"; + hipLaunchKernelGGL(vector_square_kernel, + grid_size, + threads_per_block, + 0, + hipStreamDefault, + d_out, + d_in, + size); + + // Check that the kernel invocation was successful. 
+ HIP_CHECK(hipGetLastError()); + + std::cout << "info: copy Device2Host\n"; + HIP_CHECK(hipMemcpy(h_out.data(), d_out, size_in_bytes, hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + + std::cout << "info: check result\n"; + for(size_t i = 0; i < size; i++) + { + if(h_out[i] != h_in[i] * h_in[i]) + { + std::cerr << "FAILED! h_out[" << i << "] = " << h_out[i] + << ", expected: " << h_in[i] * h_in[i] << '\n'; + exit(error_exit_code); + } + } + std::cout << "PASSED!\n"; +} diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll new file mode 100644 index 00000000..31c713de --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll @@ -0,0 +1,97 @@ +; ModuleID = 'main_gfx1030.bc' +source_filename = "./main.hip" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } + +$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any + +$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any + +@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) 
externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 +@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind +define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { + %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 + %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 + %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* + %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 + %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 + %10 = bitcast i8 addrspace(4)* 
%9 to i16 addrspace(4)* + %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 + %12 = zext i16 %11 to i32 + %13 = mul i32 %4, %12 + %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 + %15 = add i32 %13, %14 + %16 = zext i32 %15 to i64 + %17 = zext i32 %8 to i64 + %18 = icmp ult i64 %16, %2 + br i1 %18, label %20, label %19 + +19: ; preds = %20, %3 + ret void + +20: ; preds = %3, %20 + %21 = phi i64 [ %26, %20 ], [ %16, %3 ] + %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 + %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 + %24 = fmul contract float %23, %23 + %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 + store float %24, float addrspace(1)* %25, align 4, !tbaa !16 + %26 = add i64 %21, %17 + %27 = icmp ult i64 %26, %2 + br i1 %27, label %20, label %19, !llvm.loop !20 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC 
Level", i32 1} +!2 = !{i32 2, i32 0} +!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} +!4 = !{!5, !9, i64 12} +!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} +!6 = !{!"short", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!"int", !7, i64 0} +!10 = !{!"long", !7, i64 0} +!11 = !{!"any pointer", !7, i64 0} +!12 = !{!"hsa_signal_s", !10, i64 0} +!13 = !{i16 1, i16 1025} +!14 = !{} +!15 = !{i32 0, i32 1024} +!16 = !{!17, !17, i64 0} +!17 = !{!"float", !18, i64 0} +!18 = !{!"omnipotent char", !19, i64 0} +!19 = !{!"Simple C++ TBAA"} +!20 = distinct !{!20, !21} +!21 = !{!"llvm.loop.mustprogress"} diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx803.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx803.ll new file mode 100644 index 00000000..a0d9f588 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main_gfx803.ll @@ -0,0 +1,97 @@ +; ModuleID = 'main_gfx803.bc' +source_filename = "./main.hip" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } + +$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any + +$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any + +@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE 
= weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 +@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind +define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { + %4 = tail call 
i32 @llvm.amdgcn.workgroup.id.x() #2 + %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 + %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* + %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 + %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 + %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* + %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 + %12 = zext i16 %11 to i32 + %13 = mul i32 %4, %12 + %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 + %15 = add i32 %13, %14 + %16 = zext i32 %15 to i64 + %17 = zext i32 %8 to i64 + %18 = icmp ult i64 %16, %2 + br i1 %18, label %20, label %19 + +19: ; preds = %20, %3 + ret void + +20: ; preds = %3, %20 + %21 = phi i64 [ %26, %20 ], [ %16, %3 ] + %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 + %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 + %24 = fmul contract float %23, %23 + %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 + store float %24, float addrspace(1)* %25, align 4, !tbaa !16 + %26 = add i64 %21, %17 + %27 = icmp ult i64 %26, %2 + br i1 %27, label %20, label %19, !llvm.loop !20 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx803" 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 1} +!2 = !{i32 2, i32 0} +!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} +!4 = !{!5, !9, i64 12} +!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} +!6 = !{!"short", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!"int", !7, i64 0} +!10 = !{!"long", !7, i64 0} +!11 = !{!"any pointer", !7, i64 0} +!12 = !{!"hsa_signal_s", !10, i64 0} +!13 = !{i16 1, i16 1025} +!14 = !{} +!15 = !{i32 0, i32 1024} +!16 = !{!17, !17, i64 0} +!17 = !{!"float", !18, i64 0} +!18 = !{!"omnipotent char", !19, i64 0} +!19 = !{!"Simple C++ TBAA"} +!20 = distinct !{!20, !21} +!21 = !{!"llvm.loop.mustprogress"} diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx900.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx900.ll new file mode 100644 index 00000000..67ff0a30 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main_gfx900.ll @@ -0,0 +1,97 @@ +; ModuleID = 'main_gfx900.bc' +source_filename = "./main.hip" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } 
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } + +$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any + +$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any + +@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 +@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", 
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind +define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { + %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 + %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 + %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* + %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 + %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 + %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* + %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 + %12 = zext i16 %11 to i32 + %13 = mul i32 %4, %12 + %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 + %15 = add i32 %13, %14 + %16 = zext i32 %15 to i64 + %17 = zext i32 %8 to i64 + %18 = icmp ult i64 %16, %2 + br i1 %18, label %20, label %19 + +19: ; preds = %20, %3 + ret void + +20: ; preds = %3, %20 + %21 = phi i64 [ %26, %20 ], [ %16, %3 ] + %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 + %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 + %24 = fmul contract float %23, %23 + %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 + store float %24, float addrspace(1)* %25, align 4, !tbaa !16 + %26 = add i64 %21, %17 + %27 = icmp ult i64 %26, %2 + br i1 %27, label %20, label %19, !llvm.loop !20 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable willreturn 
+declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 1} +!2 = !{i32 2, i32 0} +!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} +!4 = !{!5, !9, i64 12} +!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} +!6 = !{!"short", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!"int", !7, i64 0} +!10 = !{!"long", !7, i64 0} +!11 = !{!"any pointer", !7, i64 0} +!12 = !{!"hsa_signal_s", !10, i64 0} +!13 = !{i16 1, i16 1025} +!14 = !{} +!15 = !{i32 0, i32 1024} +!16 = !{!17, !17, i64 0} +!17 = !{!"float", !18, i64 0} +!18 = !{!"omnipotent char", !19, i64 0} +!19 = !{!"Simple C++ TBAA"} +!20 = distinct !{!20, !21} +!21 = !{!"llvm.loop.mustprogress"} diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx906.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx906.ll new file mode 100644 index 00000000..76819daf --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main_gfx906.ll @@ -0,0 +1,97 @@ +; ModuleID = 'main_gfx906.bc' +source_filename = "./main.hip" +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } + +$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any + +$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any + +@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 +@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* 
addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind +define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { + %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 + %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 + %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* + %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 + %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 + %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* + %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 + %12 = zext i16 %11 to i32 + %13 = mul i32 %4, %12 + %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 + %15 = add i32 %13, %14 + %16 = zext i32 %15 to i64 + %17 = zext i32 %8 to i64 + %18 = icmp ult i64 %16, %2 + br i1 %18, label %20, label %19 + +19: ; preds = %20, %3 + ret void + +20: ; preds = %3, %20 + %21 = phi i64 [ %26, %20 ], [ %16, %3 ] + %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 + %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 + %24 = fmul contract float %23, %23 + %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 + store float %24, float addrspace(1)* %25, align 4, !tbaa !16 + %26 = add i64 %21, %17 + %27 = icmp ult i64 %26, %2 + br 
i1 %27, label %20, label %19, !llvm.loop !20 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 1} +!2 = !{i32 2, i32 0} +!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} +!4 = !{!5, !9, i64 12} +!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} +!6 = !{!"short", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!"int", !7, i64 0} +!10 = !{!"long", !7, i64 0} +!11 = !{!"any pointer", !7, i64 0} +!12 = !{!"hsa_signal_s", !10, i64 0} +!13 = !{i16 1, i16 1025} +!14 = !{} +!15 = !{i32 0, i32 1024} +!16 = !{!17, !17, i64 0} +!17 = !{!"float", !18, i64 0} +!18 = !{!"omnipotent char", !19, i64 0} +!19 = !{!"Simple C++ TBAA"} +!20 = distinct !{!20, !21} +!21 = 
!{!"llvm.loop.mustprogress"} diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx908.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx908.ll new file mode 100644 index 00000000..50a94f21 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main_gfx908.ll @@ -0,0 +1,97 @@ +; ModuleID = 'main_gfx908.bc' +source_filename = "./main.hip" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } + +$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any + +$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any + +@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 +@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", 
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind +define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { + %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 + %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 + %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* + %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 + %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 + %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* + %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 + %12 = zext i16 %11 to i32 + %13 = mul i32 %4, %12 + %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 + %15 = add i32 %13, %14 + %16 = zext i32 %15 to i64 + %17 = zext i32 %8 to i64 + %18 = icmp ult i64 %16, %2 + br i1 %18, label %20, label %19 + +19: ; preds = %20, %3 + ret void + +20: ; preds = %3, %20 + %21 = phi i64 [ %26, %20 
], [ %16, %3 ] + %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 + %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 + %24 = fmul contract float %23, %23 + %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 + store float %24, float addrspace(1)* %25, align 4, !tbaa !16 + %26 = add i64 %21, %17 + %27 = icmp ult i64 %26, %2 + br i1 %27, label %20, label %19, !llvm.loop !20 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 1} +!2 = !{i32 2, i32 0} +!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} +!4 = !{!5, !9, i64 12} +!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} +!6 = !{!"short", !7, 
i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!"int", !7, i64 0} +!10 = !{!"long", !7, i64 0} +!11 = !{!"any pointer", !7, i64 0} +!12 = !{!"hsa_signal_s", !10, i64 0} +!13 = !{i16 1, i16 1025} +!14 = !{} +!15 = !{i32 0, i32 1024} +!16 = !{!17, !17, i64 0} +!17 = !{!"float", !18, i64 0} +!18 = !{!"omnipotent char", !19, i64 0} +!19 = !{!"Simple C++ TBAA"} +!20 = distinct !{!20, !21} +!21 = !{!"llvm.loop.mustprogress"} diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll new file mode 100644 index 00000000..dc293da3 --- /dev/null +++ b/HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll @@ -0,0 +1,97 @@ +; ModuleID = 'main_gfx90a.bc' +source_filename = "./main.hip" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } +%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } + +$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any + +$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any + +$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any + +@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant 
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 +@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 +@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind +define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { + %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 + %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 + %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* + %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 + %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 + %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* + %11 = 
load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 + %12 = zext i16 %11 to i32 + %13 = mul i32 %4, %12 + %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 + %15 = add i32 %13, %14 + %16 = zext i32 %15 to i64 + %17 = zext i32 %8 to i64 + %18 = icmp ult i64 %16, %2 + br i1 %18, label %20, label %19 + +19: ; preds = %20, %3 + ret void + +20: ; preds = %3, %20 + %21 = phi i64 [ %26, %20 ], [ %16, %3 ] + %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 + %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 + %24 = fmul contract float %23, %23 + %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 + store float %24, float addrspace(1)* %25, align 4, !tbaa !16 + %26 = add i64 %21, %17 + %27 = icmp ult i64 %26, %2 + br i1 %27, label %20, label %19, !llvm.loop !20 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 1} 
+!2 = !{i32 2, i32 0} +!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} +!4 = !{!5, !9, i64 12} +!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} +!6 = !{!"short", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!"int", !7, i64 0} +!10 = !{!"long", !7, i64 0} +!11 = !{!"any pointer", !7, i64 0} +!12 = !{!"hsa_signal_s", !10, i64 0} +!13 = !{i16 1, i16 1025} +!14 = !{} +!15 = !{i32 0, i32 1024} +!16 = !{!17, !17, i64 0} +!17 = !{!"float", !18, i64 0} +!18 = !{!"omnipotent char", !19, i64 0} +!19 = !{!"Simple C++ TBAA"} +!20 = distinct !{!20, !21} +!21 = !{!"llvm.loop.mustprogress"} diff --git a/HIP-Basic/matrix_multiplication/Makefile b/HIP-Basic/matrix_multiplication/Makefile index 151aa45b..ba6d2ade 100644 --- a/HIP-Basic/matrix_multiplication/Makefile +++ b/HIP-Basic/matrix_multiplication/Makefile @@ -45,7 +45,7 @@ else $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) endif -$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/cmdparser.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp $(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ clean: diff --git a/HIP-Basic/matrix_multiplication/argument_parsing.hpp b/HIP-Basic/matrix_multiplication/argument_parsing.hpp deleted file mode 100644 index 80ffc270..00000000 --- a/HIP-Basic/matrix_multiplication/argument_parsing.hpp +++ /dev/null @@ -1,86 +0,0 @@ -// MIT License -// -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#ifndef HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP -#define HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP - -#include "example_utils.hpp" - -#include -#include -#include -#include - -#include - -/// \brief Tries to read the matrix dimensions from the command line. -/// If no command line arguments were provided, the passed values are not modified. -/// Otherwise, the number of arguments must be 3: -/// (B rows will be equal to A columns). -/// If the number of arguments is different, or the arguments cannot be parsed to -/// unsigned ints, an error message is printed and the program exits with a non-zero code. 
-inline void matrix_dimensions_from_command_line(const int argc, - const char* argv[], - unsigned int& a_rows, - unsigned int& a_cols, - unsigned int& b_cols, - const unsigned int block_size) -{ - const auto print_usage_and_exit = [=]() - { - const std::string usage_message - = "Calculates matrix product A*B.\n" - "Usage: hip_matrix_multiplication [ ].\n" - "Matrix dimensions must be positive multiples of block_size (" - + std::to_string(block_size) + ")"; - std::cout << usage_message << std::endl; - exit(error_exit_code); - }; - const auto get_argument_by_index = [=](const unsigned int index) -> unsigned int - { - const std::string_view argument_text(argv[index]); - - unsigned int converted_value; - const auto conversion_result = std::from_chars(argument_text.data(), - argument_text.data() + argument_text.size(), - converted_value); - if(conversion_result.ec != std::errc{} || (converted_value % block_size) != 0) - { - print_usage_and_exit(); - } - return converted_value; - }; - - if(argc == 1) - { - return; - } - if(argc != 4) - { - print_usage_and_exit(); - } - a_rows = get_argument_by_index(1); - a_cols = get_argument_by_index(2); - b_cols = get_argument_by_index(3); -} - -#endif // HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP diff --git a/HIP-Basic/matrix_multiplication/main.hip b/HIP-Basic/matrix_multiplication/main.hip index e651856a..90836e37 100644 --- a/HIP-Basic/matrix_multiplication/main.hip +++ b/HIP-Basic/matrix_multiplication/main.hip @@ -20,7 +20,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "argument_parsing.hpp" +#include "cmdparser.hpp" #include "example_utils.hpp" #include @@ -108,18 +108,53 @@ __global__ void matrix_multiplication_kernel(const float* A, // Every thread stores the final result to global memory. 
C[block_offset + b_cols * ty + tx] = thread_result; } +template +void configure_parser(cli::Parser& parser) +{ + // Default parameters + constexpr unsigned int a_rows = 2048; + constexpr unsigned int a_cols = 1024; + constexpr unsigned int b_cols = 1024; + + static_assert( + ((a_rows % BlockSize == 0) && (a_cols % BlockSize == 0) && (b_cols % BlockSize == 0)), + "Matrix dimensions must be positive multiples of block_size"); + + parser.set_optional("A_rows", + "A_rows", + a_rows, + "Number of rows in Matrix A"); // Default 2048 + parser.set_optional("A_cols", + "A_cols", + a_cols, + "Number of columns in Matrix A"); // Default 1024 + parser.set_optional("B_cols", + "B_cols", + b_cols, + "Number of columns in Matrix B"); // Default 1024 +} int main(int argc, const char* argv[]) { constexpr unsigned int block_size = 16; - // Default values are provided below. - unsigned int a_rows = 2048; - unsigned int a_cols = 1024; - unsigned int b_cols = 1024; + // Parse user inputs + cli::Parser parser(argc, argv); + configure_parser(parser); + parser.run_and_exit_if_error(); // Get matrix dimensions from the command line, if provided. - matrix_dimensions_from_command_line(argc, argv, a_rows, a_cols, b_cols, block_size); + const unsigned int a_rows = parser.get("A_rows"); + const unsigned int a_cols = parser.get("A_cols"); + const unsigned int b_cols = parser.get("B_cols"); + + if((a_rows % block_size != 0) || (a_cols % block_size != 0) || (b_cols % block_size != 0)) + { + std::cout << "Matrix dimensions must be positive multiples of block_size (" + + std::to_string(block_size) + ")" + << std::endl; + exit(error_exit_code); + } // Outer matrix dimensions must match. 
const unsigned int b_rows = a_cols; diff --git a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln index 9297291b..b11412dc 100644 --- a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln +++ b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln @@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.32630.194 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_template_vs2019", "example_template_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix_multiplication_vs2019", "matrix_multiplication_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj index 4d1790e6..81bac082 100644 --- a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj +++ b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj @@ -1,97 +1,101 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - 15.0 - {ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E} - Win32Proj - matrix_multiplication_vs2019 - 10.0 - - - - Application - true - HIP - Unicode - - - Application - false - HIP - true - Unicode - - - - - - - - - - - - - - - - true - - - false - - - gfx1030 - - - gfx1030 - - - - Level1 - __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - stdcpp17 - $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) - - - Console - true - - - - - Level2 - true - true - __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - stdcpp17 - 
$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) - - - Console - true - true - true - - - - - - - \ No newline at end of file + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E} + Win32Proj + matrix_multiplication_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + true + + + Console + true + true + true + + + + + + + diff --git a/HIP-Basic/occupancy/README.md b/HIP-Basic/occupancy/README.md index aced51a6..d641278e 100644 --- a/HIP-Basic/occupancy/README.md +++ b/HIP-Basic/occupancy/README.md @@ -1,7 +1,7 @@ -# HIP-Basic Occupany Example +# HIP-Basic Occupancy Example ## Description -This example showcases how to find optimal configuation parameters for a kernel launch with maximum occupancy. It uses the HIP occupancy calculator APIs to find a kernel launch configuration that yields maximum occupancy. This configuration is used to launch a kernel and measures the utilization difference against another kernel launch that is manually (and suboptimally) configured. The application kernel is a simple vector-vector multiplication of the form `C[i] = A[i]*B[i]`, where `A`, `B` and `C` are vectors of size `size`. +This example showcases how to find optimal configuration parameters for a kernel launch with maximum occupancy. 
It uses the HIP occupancy calculator APIs to find a kernel launch configuration that yields maximum occupancy. This configuration is used to launch a kernel and measures the utilization difference against another kernel launch that is manually (and suboptimally) configured. The application kernel is a simple vector--vector multiplication of the form `C[i] = A[i]*B[i]`, where `A`, `B` and `C` are vectors of size `size`. The example shows 100% occupancy for both manual and automatic configurations, because the simple kernel does not use much resources per-thread or per-block, especially `__shared__` memory. The execution time for the automatic launch is still lower because of a lower overhead associated with fewer blocks being executed. diff --git a/HIP-Basic/occupancy/occupancy_vs2019.vcxproj b/HIP-Basic/occupancy/occupancy_vs2019.vcxproj index b8eee422..a1c151fa 100644 --- a/HIP-Basic/occupancy/occupancy_vs2019.vcxproj +++ b/HIP-Basic/occupancy/occupancy_vs2019.vcxproj @@ -1,95 +1,99 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - 15.0 - {e5b2fc79-3928-47f6-b57b-33aaa3c5d9c5} - Win32Proj - occupancy_vs2019 - 10.0 - - - - Application - true - HIP - Unicode - - - Application - false - HIP - true - Unicode - - - - - - - - - - - - - - - - true - - - false - - - gfx1030 - - - gfx1030 - - - - Level1 - __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) - - - Console - true - - - - - Level2 - true - true - __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) - - - Console - true - true - true - - - - - - - \ No newline at end of file + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {e5b2fc79-3928-47f6-b57b-33aaa3c5d9c5} + Win32Proj + occupancy_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + 
Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + stdcpp17 + + + Console + true + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + stdcpp17 + + + Console + true + true + true + + + + + + + diff --git a/HIP-Basic/runtime_compilation/.gitignore b/HIP-Basic/runtime_compilation/.gitignore new file mode 100644 index 00000000..080d9030 --- /dev/null +++ b/HIP-Basic/runtime_compilation/.gitignore @@ -0,0 +1 @@ +hip_runtime_compilation diff --git a/HIP-Basic/runtime_compilation/CMakeLists.txt b/HIP-Basic/runtime_compilation/CMakeLists.txt new file mode 100644 index 00000000..47974f94 --- /dev/null +++ b/HIP-Basic/runtime_compilation/CMakeLists.txt @@ -0,0 +1,70 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name hip_runtime_compilation) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +if(NOT CMAKE_PREFIX_PATH) + set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") +endif() + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(link_libs "") +set(include_dirs "../../Common") + +if(GPU_RUNTIME STREQUAL "HIP") + # Link hiprtc library + find_library(HIPRTC_LIB hiprtc REQUIRED) + list(APPEND link_libs "${HIPRTC_LIB}") +endif() + +if(GPU_RUNTIME STREQUAL "CUDA") + # Include the HIP header directory. 
+ list(APPEND include_dirs "${ROCM_ROOT}/include") + # In this example we also need to link nvrtc CUDA library + find_package("CUDAToolkit" REQUIRED) + list(APPEND link_libs "CUDA::nvrtc") +endif() + +target_link_libraries(${example_name} ${link_libs}) +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) diff --git a/HIP-Basic/runtime_compilation/Makefile b/HIP-Basic/runtime_compilation/Makefile new file mode 100644 index 00000000..65ba415d --- /dev/null +++ b/HIP-Basic/runtime_compilation/Makefile @@ -0,0 +1,55 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +EXAMPLE := hip_runtime_compilation +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +CXXFLAGS := -std=$(CXX_STD) +CPPFLAGS := -I $(COMMON_INCLUDE_DIR) +LDFLAGS := +LDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + CXXFLAGS += -x cu + CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) + LDLIBS += -l nvrtc +else ifeq ($(GPU_RUNTIME), HIP) +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp + $(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/HIP-Basic/runtime_compilation/README.md b/HIP-Basic/runtime_compilation/README.md new file mode 100644 index 00000000..a1c7e39e --- /dev/null +++ b/HIP-Basic/runtime_compilation/README.md @@ -0,0 +1,91 @@ +# HIP-Basic Runtime Compilation Example + +## Description + +Runtime compilation allows compiling fragments of source code to machine code at runtime, when a program is already running, rather than compiling the code ahead of time. HIP supports runtime compilation through hipRTC, which can be used to compile HIP device code at runtime. This permits specific optimizations that depend on values determined at runtime. Therefore, usage of hipRTC provides the possibility of obtaining optimizations and performance improvements over offline compilation. + +This example showcases how to make use of hipRTC to compile in runtime a kernel and launch it on a device. This kernel is a simple SAXPY, i.e. a single-precision operation $y_i=ax_i+y_i$. + +### Application flow +The diagram below summarizes the runtime compilation part of the example. +1. A number of variables are declared and defined to configure the program which will be compiled in runtime. +2. 
The program is created using the above variables as parameters, along with the SAXPY kernel in string form. +3. The properties of the first device (GPU) available are consulted to set the device architecture as (the only) compile option. +4. The program is compiled using the previously mentioned compile options. +5. If it exists, the log generated during the compile process is printed to the standard output. +6. The binary compiled from the program is stored as a vector of characters and the program object is destroyed. +7. Begin the preparation for the launch of the kernel on the device. A number of constants are defined to control the problem details and the kernel launch parameters. +8. The two input vectors, $x$ and $y$, are instantiated in host memory and filled with the increasing sequences $1, 2, 3, 4, ...$ and $2, 4, 6, 8, ...$, respectively. +9. The necessary amount of device (GPU) memory is allocated and the elements of the input vectors are copied to the device memory. +10. A HIP module corresponding to the compiled binary is loaded into the current context and the SAXPY kernel is extracted from it into a HIP function object. +11. The kernel launch configuration options and its arguments are declared and defined. +12. A trace message is printed to the standard output. +13. The GPU kernel is then launched with the above mentioned options along with the constants defined previously. +14. The results are copied back to host vector $y$. +15. The previously allocated device memory is freed. +16. The module is unloaded from the current context and freed. +17. The first few elements of the result vector $y$ are printed to the standard output. + +![hiprtc.svg](hiprtc.svg) +## Key APIs and Concepts +- `hipGetDeviceProperties` extracts the properties of the desired device. In this example it is used to get the GPU architecture. +- `hipModuleGetFunction` extracts a handle for a function with a certain name from a given module. 
Note that if no function with that name is present in the module this method will return an error. +- `hipModuleLaunchKernel` queues the launch of the provided kernel on the device. This function normally presents an asynchronous behaviour (see `HIP_LAUNCH_BLOCKING`), i.e. a call to it may return before the device finishes the execution of the kernel. Its parameters are the following: + - The kernel to be launched. + - Number of blocks in the dimension X of kernel grid, i.e. the X component of grid size. + - Number of blocks in the dimension Y of kernel grid, i.e. the Y component of grid size. + - Number of blocks in the dimension Z of kernel grid, i.e. the Z component of grid size. + - Number of threads in the dimension X of each block, i.e. the X component of block size. + - Number of threads in the dimension Y of each block, i.e. the Y component of block size. + - Number of threads in the dimension Z of each block, i.e. the Z component of block size. + - Amount of dynamic shared memory that will be available to each workgroup, in bytes. Not used in this example. + - The device stream, on which the kernel should be dispatched. If 0 (or NULL), the NULL stream will be used. In this example the latter is used. + - Pointer to the arguments needed by the kernel. Note that this parameter is not yet implemented, and thus the _extra_ parameter (the last one described in this list) should be used to pass arguments to the kernel. + - Pointer to all extra arguments passed to the kernel. They must be in the memory layout and alignment expected by the kernel. The list of arguments must end with `HIP_LAUNCH_PARAM_END`. +- `hipModuleLoadData` builds a module from a code (compiled binary) object residing in host memory and loads it into the current context. Note that in this example this function is called right after `hipMalloc`. 
This is due to the fact that, on CUDA, `hipModuleLoadData` will fail if it is not called after some runtime API call is done (as it will implicitly initialize a current context) or if there is not an explicit creation of a (current) context. +- `hipModuleUnload` unloads the specified module from the current context and frees it. +- `hiprtcCompileProgram` compiles the given program at runtime. Some compilation options may be passed as parameters to this function. In this example, the GPU architecture is the only compilation option. +- `hiprtcCreateProgram` instantiates a runtime compilation program from the given parameters. Those are the following: + - The runtime compilation program object that will be set with the new instance. + - A pointer to the program source code. + - A pointer to the program name. + - The number of headers to be included. + - An array of pointers to the header sources. + - An array of pointers to the names under which those headers are included in the source program. + + In this example the program is created including two header files to illustrate how to pass all of the above arguments to this function. +- `hiprtcDestroyProgram` destroys an instance of a given runtime compilation program object. +- `hiprtcGetProgramLog` extracts the char pointer to the log generated during the compilation of a given runtime compilation program. +- `hiprtcGetProgramLogSize` returns the compilation log size of a given runtime compilation program, measured as number of characters. +- `hiprtcGetCode` extracts the char pointer to the compilation binary in memory from a runtime compilation program object. This binary is needed to load the corresponding HIP module into the current context and extract from it the kernel(s) that will be executed on the GPU. +- `hiprtcGetCodeSize` returns the size of the binary compiled of a given runtime compilation program, measured as number of characters. 
+ +## Demonstrated API Calls + +### HIP runtime + +#### Device symbols +- `threadIdx`, `blockIdx`, `blockDim` + +#### Host symbols +- `hipFree` +- `hipGetDeviceProperties` +- `hipGetLastError` +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyDeviceToHost` +- `hipMemcpyHostToDevice` +- `hipModuleGetFunction` +- `hipModuleLaunchKernel` +- `hipModuleLoadData` +- `hipModuleUnload` +- `hiprtcCompileProgram` +- `hiprtcCreateProgram` +- `hiprtcDestroyProgram` +- `hiprtcGetCode` +- `hiprtcGetCodeSize` +- `hiprtcGetProgramLog` +- `hiprtcGetProgramLogSize` +- `HIP_LAUNCH_PARAM_BUFFER_POINTER` +- `HIP_LAUNCH_PARAM_BUFFER_SIZE` +- `HIP_LAUNCH_PARAM_END` diff --git a/HIP-Basic/runtime_compilation/hiprtc.svg b/HIP-Basic/runtime_compilation/hiprtc.svg new file mode 100644 index 00000000..15aa28dc --- /dev/null +++ b/HIP-Basic/runtime_compilation/hiprtc.svg @@ -0,0 +1,3 @@ + + +
const char* src
const char* src
hiprtcProgram
hiprtcProgram
hiprtcProgram
hiprtcProgram
hiprtcCreateProgram
hiprtcCreateProgram
hiprtcCompileProgram
hiprtcCompileProgram
char* bin
char* bin
hiprtcGetCode
hiprtcGetCode
hiprtcDestroyProgram
hiprtcDestroyProgram
hipModuleLoadData
hipModuleLoadData
hipModule_t
hipModule_t
hipModuleGetFunction
hipModuleGetFunction
hipFunction_t
hipFunction_t
hipModuleLaunchKernel
hipModuleLaunchKernel
hipModuleUnload
hipModuleUnload
const char* kernel_name
const char* kernel_name
Text is not SVG - cannot display
\ No newline at end of file diff --git a/HIP-Basic/runtime_compilation/main.hip b/HIP-Basic/runtime_compilation/main.hip new file mode 100644 index 00000000..f979e420 --- /dev/null +++ b/HIP-Basic/runtime_compilation/main.hip @@ -0,0 +1,215 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "example_utils.hpp" + +#include +#include + +#include +#include +#include +#include + +// SAXPY kernel stored as a string +static constexpr auto saxpy_kernel{ + R"( +#include "test_header.h" +#include "test_header1.h" +extern "C" +__global__ void saxpy_kernel(const real a, const realptr d_x, realptr d_y, const unsigned int size) +{ + const unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + if(global_idx < size) + { + d_y[global_idx] = a * d_x[global_idx] + d_y[global_idx]; + } +} +)"}; + +int main() +{ + // Program to be compiled in runtime. + hiprtcProgram prog; + + // Vector containing example header names. + std::vector header_names; + header_names.push_back("test_header.h"); + header_names.push_back("test_header1.h"); + + // Vector containing example names to be included in the program. + std::vector header_sources; + header_sources.push_back("#ifndef HIPRTC_TEST_HEADER_H\n#define HIPRTC_TEST_HEADER_H\ntypedef " + "float real;\n#endif //HIPRTC_TEST_HEADER_H\n"); + header_sources.push_back( + "#ifndef HIPRTC_TEST_HEADER1_H\n#define HIPRTC_TEST_HEADER1_H\ntypedef float* " + "realptr;\n#endif //HIPRTC_TEST_HEADER1_H\n"); + + // Create program. + hiprtcCreateProgram(&prog, + saxpy_kernel, + "saxpy_kernel.cu", + header_sources.size(), + header_sources.data(), + header_names.data()); + + // Get device properties from the first device available. + hipDeviceProp_t props; + constexpr unsigned int device_id = 0; + HIP_CHECK(hipGetDeviceProperties(&props, device_id)); + + // Obtain architecture's name from device properties and initialize array of compile options. When in CUDA we omit this option. + std::string sarg + = (props.gcnArchName[0]) ? std::string("--gpu-architecture=") + props.gcnArchName : ""; + const char* options[] = {sarg.c_str()}; + const int num_options = !sarg.empty(); + + // Compile program in runtime. Parameters are the program, number of options and array with options. 
+ const hiprtcResult compile_result{hiprtcCompileProgram(prog, num_options, options)}; + + // Get the size of the log (possibly) generated during the compilation. + size_t log_size; + hiprtcGetProgramLogSize(prog, &log_size); + + // If the compilation generated a log, print it. + if(log_size) + { + std::string log(log_size, '\0'); + hiprtcGetProgramLog(prog, &log[0]); + std::cout << log << std::endl; + } + + // If the compilation failed, say so and exit. + if(compile_result != HIPRTC_SUCCESS) + { + std::cout << "Error: compilation failed." << std::endl; + return EXIT_FAILURE; + } + + // Get the size (in number of characters) of the binary compiled from the program. + size_t code_size; + hiprtcGetCodeSize(prog, &code_size); + + // Store compiled binary as a vector of characters. + std::vector code(code_size); + hiprtcGetCode(prog, code.data()); + + // Destroy program object. + hiprtcDestroyProgram(&prog); + + // Now we launch the kernel on the device. + + // Total number of float elements in each device vector. + constexpr unsigned int size = 4096; + + // Total number of bytes to allocate for each device vector. + constexpr size_t size_bytes = size * sizeof(float); + + // Number of threads per kernel block. + constexpr unsigned int block_size = 128; + + // Number of blocks per kernel grid, calculated as ceil(size/block_size). + constexpr unsigned int grid_size = (size + block_size - 1) / block_size; + + // Constant value 'a' to be used in the expression 'a*x+y'. + constexpr float a = 5.1f; + + // Allocate x vector in host and fill it with increasing sequence 1, 2, 3, 4, ... . + std::vector x(size); + std::iota(x.begin(), x.end(), 1.f); + + // Allocate y vector in host and fill it with increasing sequence 2, 4, 6, 8, ... . + std::vector y(x); + std::for_each(y.begin(), y.end(), [](float& f) { f = 2 * f; }); + + // Allocate vectors in device and copy from host to device memory. 
+ float* d_x{}; + float* d_y{}; + HIP_CHECK(hipMalloc(&d_x, size_bytes)); + HIP_CHECK(hipMalloc(&d_y, size_bytes)); + HIP_CHECK(hipMemcpy(d_x, x.data(), size_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_y, y.data(), size_bytes, hipMemcpyHostToDevice)); + + // Load the HIP module corresponding to the compiled binary into the current context. + hipModule_t module; + HIP_CHECK(hipModuleLoadData(&module, code.data())); + + // Extract SAXPY kernel from module into a function object. + hipFunction_t kernel; + HIP_CHECK(hipModuleGetFunction(&kernel, module, "saxpy_kernel")); + + // Create and fill array with kernel arguments. + size_t offset = 0; + char args[256] = {}; + + *(reinterpret_cast(&args[offset])) = a; + offset += sizeof(a); + offset += 4; // aligning fix for CUDA executions + *(reinterpret_cast(&args[offset])) = d_x; + offset += sizeof(d_x); + *(reinterpret_cast(&args[offset])) = d_y; + offset += sizeof(d_y); + *(reinterpret_cast(&args[offset])) = size; + offset += sizeof(size); + + // Create array with kernel arguments and its size. + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, + args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, + &offset, + HIP_LAUNCH_PARAM_END}; + + std::cout << "Calculating y[i] = a * x[i] + y[i] over " << size << " elements." << std::endl; + + // Launch the kernel on the NULL stream and with the above configuration. + HIP_CHECK(hipModuleLaunchKernel(kernel, + grid_size, + 1, + 1, + block_size, + 1, + 1, + 0, + nullptr, + nullptr, + (void**)&config)); + + // Check if the kernel launch was successful. + HIP_CHECK(hipGetLastError()) + + // Copy results from device to host. + HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost)); + + // Free device memory. + HIP_CHECK(hipFree(d_x)); + HIP_CHECK(hipFree(d_y)); + + // Unload module. + HIP_CHECK(hipModuleUnload(module)); + + // Print the first few elements of the results for validation. 
+ constexpr size_t elements_to_print = 10; + std::cout << "First " << elements_to_print << " elements of the results: " + << format_range(y.begin(), y.begin() + elements_to_print) << std::endl; + + return 0; +} \ No newline at end of file diff --git a/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.sln b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.sln new file mode 100644 index 00000000..584dd56d --- /dev/null +++ b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "runtime_compilation_vs2019", "runtime_compilation_vs2019.vcxproj", "{E03790B7-B203-4504-BEF5-F4F061183642}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.ActiveCfg = Debug|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.Build.0 = Debug|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.ActiveCfg = Release|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0} + EndGlobalSection +EndGlobal diff --git a/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj new file mode 100644 index 00000000..5e0168be --- /dev/null +++ b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj @@ -0,0 +1,101 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + 
{E03790B7-B203-4504-BEF5-F4F061183642} + Win32Proj + runtime_compilation_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + hiprtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + true + true + hiprtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + + + \ No newline at end of file diff --git a/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj.filters b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj.filters new file mode 100644 index 00000000..591e9f2c --- /dev/null +++ b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj.filters @@ -0,0 +1,27 @@ + + + + + {2932a426-602b-4926-887e-27c50ba7eab7} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {ed043ec4-e8ac-4831-93f5-a58546ec7bea} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {0da954bd-e555-4454-b082-b68d10c753b9} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff --git a/HIP-Basic/saxpy/saxpy_vs2019.vcxproj b/HIP-Basic/saxpy/saxpy_vs2019.vcxproj index 1844610a..d9602491 100644 --- 
a/HIP-Basic/saxpy/saxpy_vs2019.vcxproj +++ b/HIP-Basic/saxpy/saxpy_vs2019.vcxproj @@ -52,15 +52,17 @@ true + hip_$(ProjectName) false + hip_$(ProjectName) - gfx1030;gfx90c:xnack- + gfx1030 - gfx1030;gfx90c:xnack- + gfx1030 @@ -94,4 +96,4 @@ - \ No newline at end of file + diff --git a/HIP-Basic/shared_memory/.gitignore b/HIP-Basic/shared_memory/.gitignore new file mode 100644 index 00000000..9c7163b7 --- /dev/null +++ b/HIP-Basic/shared_memory/.gitignore @@ -0,0 +1 @@ +hip_shared_memory diff --git a/HIP-Basic/shared_memory/CMakeLists.txt b/HIP-Basic/shared_memory/CMakeLists.txt new file mode 100644 index 00000000..49a91f20 --- /dev/null +++ b/HIP-Basic/shared_memory/CMakeLists.txt @@ -0,0 +1,59 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +set(example_name hip_shared_memory) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +if(NOT CMAKE_PREFIX_PATH) + set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") +endif() + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") + +# For examples targeting NVIDIA, include the HIP header directory. +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) diff --git a/HIP-Basic/shared_memory/Makefile b/HIP-Basic/shared_memory/Makefile new file mode 100644 index 00000000..36a7d271 --- /dev/null +++ b/HIP-Basic/shared_memory/Makefile @@ -0,0 +1,54 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := hip_shared_memory +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +CXXFLAGS := -std=$(CXX_STD) +CPPFLAGS := -I $(COMMON_INCLUDE_DIR) +LDFLAGS := +LDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + CXXFLAGS += -x cu + CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". 
GPU_RUNTIME must be either CUDA or HIP) +endif + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp + $(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/HIP-Basic/shared_memory/README.md b/HIP-Basic/shared_memory/README.md new file mode 100644 index 00000000..9f05802f --- /dev/null +++ b/HIP-Basic/shared_memory/README.md @@ -0,0 +1,47 @@ +# HIP-Basic Shared Memory Example + +## Description +The shared memory is an on-chip type of memory that is visible to all the threads within the same block, allowing them to communicate by writing and reading data from the same memory space. However, some synchronization among the threads of the block is needed to ensure that all of them have written before trying to access the data. + +When using the appropriate access pattern, this memory can provide much less latency than local or global memory (nearly as much as registers), making it a much better option in certain cases. If the size of the shared memory to be used is known at compile time, it can be explicitly specified and it is then known as static shared memory. + +This example implements a simple matrix transpose kernel to showcase how to use static shared memory. + +### Application flow +1. A number of constants are defined for the kernel launch parameters. +2. The input and output matrices are allocated and initialized in host memory. +3. The necessary amount of device memory for the input and output matrices is allocated and the input data is copied to the device. +4. A trace message is printed to the standard output. +5. The GPU kernel is then launched with the previously defined arguments. +6. The transposed matrix is copied back to host memory. +7. All device memory is freed. +8. The expected transposed matrix is calculated with a CPU version of the transpose kernel and the transposed matrix obtained from the kernel execution is then compared with it. 
The result of the comparison is printed to the standard output. + +## Key APIs and Concepts +- `__shared__` is a variable declaration specifier necessary to allocate shared memory from the device. +- `__syncthreads` synchronizes all the threads within the same block. This synchronization barrier is used to ensure that every thread in a block has finished writing to shared memory before other threads in the block try to access that data. +- `hipMalloc` allocates device memory in global memory, and with `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others. +- `hipLaunchKernelGGL` queues the execution of a kernel on a device (GPU). +- `hipGetLastError` gets the last error returned by any HIP runtime API call. +- `hipFree` deallocates device memory allocated with `hipMalloc`. + +## Demonstrated API Calls + +### HIP runtime +- `__global__` +- `__shared__` + +#### Device symbols +- `blockDim` +- `blockIdx` +- `threadIdx` +- `__syncthreads` + +#### Host symbols +- `hipFree` +- `hipGetLastError` +- `hipLaunchKernelGGL` +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyDeviceToHost` +- `hipMemcpyHostToDevice` diff --git a/HIP-Basic/shared_memory/main.hip b/HIP-Basic/shared_memory/main.hip new file mode 100644 index 00000000..9fa15f20 --- /dev/null +++ b/HIP-Basic/shared_memory/main.hip @@ -0,0 +1,160 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "example_utils.hpp" + +#include + +#include +#include +#include +#include + +/// \brief Transposes the matrix \p in and stores the result in \p out using static shared memory. +template +__global__ void matrix_transpose_kernel(float* out, const float* in) +{ + // Allocate the necessary amount of shared memory to store the transpose of the matrix. + constexpr unsigned int size = Width * Width; + __shared__ float shared_matrix_memory[size]; + + // Compute the row and column indexes of the matrix element that each thread is going + // to process. + const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; + + // If not out of bounds, transpose element (x,y). + if(x < Width && y < Width) + { + // Store transposed element in shared memory. 
+ shared_matrix_memory[y * Width + x] = in[x * Width + y]; + } + + // Syncronize threads so all writes are done before accessing shared memory again. + __syncthreads(); + + // If not out of bounds, transpose element (x,y). + if(x < Width && y < Width) + { + // Copy transposed element from shared memory to global memory. + out[y * Width + x] = shared_matrix_memory[y * Width + x]; + } +} + +// CPU implementation of matrix transpose. +std::vector expected_matrix_transpose(const std::vector& input, + const unsigned int width) +{ + std::vector output(width * width); + for(unsigned int j = 0; j < width; j++) + { + for(unsigned int i = 0; i < width; i++) + { + output[i * width + j] = input[j * width + i]; + } + } + return output; +} + +int main() +{ + // Number of rows and columns, total number of elements and size in bytes of the matrix + // to be transposed. + constexpr unsigned int width = 64; + constexpr unsigned int size = width * width; + constexpr unsigned int size_bytes = size * sizeof(float); + + // Number of threads in each dimension of the kernel block. + constexpr unsigned int block_size = 4; + + // Number of blocks in each dimension of the grid. Calculated as ceil(width/block_size). + constexpr unsigned int grid_size = (width + block_size - 1) / block_size; + + // Block and grid sizes in 2D. + constexpr dim3 block_dim(block_size, block_size); + constexpr dim3 grid_dim(grid_size, grid_size); + + // Allocate host input matrix and initialize with increasing sequence 10, 20, 30, .... + std::vector matrix(size); + std::iota(matrix.begin(), matrix.end(), 1.f); + std::for_each(matrix.begin(), matrix.end(), [](float& f) { f = 10.f * f; }); + + // Allocate matrix to store the results of the kernel execution. + std::vector transposed_matrix(size); + + // Allocate input and output matrices on device. 
+ float* d_matrix{}; + float* d_transposed_matrix{}; + HIP_CHECK(hipMalloc(&d_matrix, size_bytes)); + HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes)); + + // Copy input matrix data from host to device. + HIP_CHECK(hipMemcpy(d_matrix, matrix.data(), size_bytes, hipMemcpyHostToDevice)); + + // Print trace message. + std::cout << "Computing matrix transpose." << std::endl; + + // Launch kernel on the default stream. Passing kernel arguments at the end of the + // hipLaunchKernelGGL function call. + hipLaunchKernelGGL(matrix_transpose_kernel, + grid_dim, + block_dim, + 0, + hipStreamDefault, + d_transposed_matrix, + d_matrix); + + // Check if the kernel launch was successful. + HIP_CHECK(hipGetLastError()); + + // Copy results from device to host. + HIP_CHECK(hipMemcpy(transposed_matrix.data(), + d_transposed_matrix, + size_bytes, + hipMemcpyDeviceToHost)); + + // Free device memory. + HIP_CHECK(hipFree(d_matrix)); + HIP_CHECK(hipFree(d_transposed_matrix)); + + // Calculate expected transposed matrix with the CPU version of the kernel. + std::vector expected_transposed_matrix = expected_matrix_transpose(matrix, width); + + // Validate results comparing with expected transposed matrix. + unsigned int errors = 0; + constexpr float eps = 1.0E-6; + std::cout << "Validating transposed matrix." << std::endl; + for(unsigned int i = 0; i < size; i++) + { + errors += (std::fabs(transposed_matrix[i] - expected_transposed_matrix[i]) > eps); + } + + if(errors) + { + std::cout << "Validation failed with " << errors << " errors." << std::endl; + return error_exit_code; + } + else + { + std::cout << "Validation passed." 
<< std::endl; + } +} diff --git a/HIP-Basic/shared_memory/shared_memory_vs2019.sln b/HIP-Basic/shared_memory/shared_memory_vs2019.sln new file mode 100644 index 00000000..f256b16c --- /dev/null +++ b/HIP-Basic/shared_memory/shared_memory_vs2019.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "shared_memory_vs2019", "shared_memory_vs2019.vcxproj", "{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.ActiveCfg = Debug|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.Build.0 = Debug|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.ActiveCfg = Release|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0} + EndGlobalSection +EndGlobal diff --git a/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj new file mode 100644 index 00000000..8f74a594 --- /dev/null +++ b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj @@ -0,0 +1,99 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D} + Win32Proj + shared_memory_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + 
__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + + diff --git a/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj.filters b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj.filters new file mode 100644 index 00000000..591e9f2c --- /dev/null +++ b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj.filters @@ -0,0 +1,27 @@ + + + + + {2932a426-602b-4926-887e-27c50ba7eab7} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {ed043ec4-e8ac-4831-93f5-a58546ec7bea} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {0da954bd-e555-4454-b082-b68d10c753b9} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff --git a/HIP-Basic/streams/streams_vs2019.vcxproj b/HIP-Basic/streams/streams_vs2019.vcxproj index 2b0e8932..50d5b2d3 100644 --- a/HIP-Basic/streams/streams_vs2019.vcxproj +++ b/HIP-Basic/streams/streams_vs2019.vcxproj @@ -52,9 +52,11 @@ true + hip_$(ProjectName) false + hip_$(ProjectName) gfx1030 @@ -67,6 +69,7 @@ Level1 __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + stdcpp17 Console @@ -80,6 +83,7 @@ true __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + stdcpp17 Console @@ -92,4 +96,4 @@ - \ No newline at end of file + diff --git a/HIP-Basic/warp_shuffle/.gitignore b/HIP-Basic/warp_shuffle/.gitignore new file mode 100644 index 00000000..561ef15b --- /dev/null +++ 
b/HIP-Basic/warp_shuffle/.gitignore @@ -0,0 +1 @@ +hip_warp_shuffle diff --git a/HIP-Basic/warp_shuffle/CMakeLists.txt b/HIP-Basic/warp_shuffle/CMakeLists.txt new file mode 100644 index 00000000..f8f8b666 --- /dev/null +++ b/HIP-Basic/warp_shuffle/CMakeLists.txt @@ -0,0 +1,58 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +set(example_name hip_warp_shuffle) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") +set(GPU_RUNTIMES "HIP" "CUDA") +set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) + +if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) + set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") + message(FATAL_ERROR ${ERROR_MESSAGE}) +endif() + +enable_language(${GPU_RUNTIME}) +set(CMAKE_${GPU_RUNTIME}_STANDARD 17) +set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) +set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) + +set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +if(NOT CMAKE_PREFIX_PATH) + set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") +endif() + +add_executable(${example_name} main.hip) +# Make example runnable using ctest. +add_test(${example_name} ${example_name}) + +set(include_dirs "../../Common") +# For examples targeting NVIDIA, include the HIP header directory. +if(GPU_RUNTIME STREQUAL "CUDA") + list(APPEND include_dirs "${ROCM_ROOT}/include") +endif() + +target_include_directories(${example_name} PRIVATE ${include_dirs}) +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) diff --git a/HIP-Basic/warp_shuffle/Makefile b/HIP-Basic/warp_shuffle/Makefile new file mode 100644 index 00000000..1143e9c4 --- /dev/null +++ b/HIP-Basic/warp_shuffle/Makefile @@ -0,0 +1,54 @@ +# MIT License +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := hip_warp_shuffle +COMMON_INCLUDE_DIR := ../../Common +GPU_RUNTIME := HIP + +# HIP variables +ROCM_INSTALL_DIR := /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +CXXFLAGS := -std=$(CXX_STD) +CPPFLAGS := -I $(COMMON_INCLUDE_DIR) +LDFLAGS := +LDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + CXXFLAGS += -x cu + CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". 
GPU_RUNTIME must be either CUDA or HIP) +endif + +$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp + $(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/HIP-Basic/warp_shuffle/README.md b/HIP-Basic/warp_shuffle/README.md new file mode 100644 index 00000000..691857d3 --- /dev/null +++ b/HIP-Basic/warp_shuffle/README.md @@ -0,0 +1,54 @@ +# HIP-Basic Warp Shuffle Example + +## Description +Kernel code for a particular block is executed in groups of threads known as _wavefronts_ (AMD) or _warps_ (NVIDIA). Each block is divided into as many warps as the block's size allows. If the block size is less than the warp size, then part of the warp just stays idle (as happens in this example). AMD GPUs use 64 threads per wavefront for architectures prior to RDNA™ 1. RDNA architectures support both 32 and 64 wavefront sizes. + +Warps are executed in _lockstep_, i.e. all the threads in each warp execute the same instruction at the same time but with different data. This type of parallel processing is also known as Single Instruction, Multiple Data (SIMD). A block contains several warps and the warp size is dependent on the architecture, but the block size is not. Blocks and warps also differ in the way they are executed, and thus they may provide different results when used in the same piece of code. For instance, the kernel code of this example would not work as is with block-level execution and shared memory access, because some synchronization would be needed to ensure that every thread has written its corresponding value before trying to access it. + +Higher performance in the execution of kernels can be achieved with explicit warp-level programming. This can be done by using some collective operations, known as _warp shuffles_, that allow exchanging data between threads in the same warp without the need for shared memory.
This exchange occurs simultaneously for all the active threads in the warp. + +This example showcases how to use the above-mentioned operations by implementing a simple matrix transpose kernel. + +### Application flow +1. A number of constants are defined for the kernel launch parameters. +2. The input and output matrices are allocated and initialized in host memory. +3. The necessary amount of device memory for the input and output matrices is allocated and the input data is copied to the device. +4. A trace message is printed to the standard output. +5. The GPU kernel is then launched with the previously defined arguments. +6. The transposed matrix is copied back to host memory. +7. All device memory is freed. +8. The expected transposed matrix is calculated with a CPU version of the transpose kernel and the transposed matrix obtained from the kernel execution is then compared with it. The result of the comparison is printed to the standard output. + +## Key APIs and Concepts +Warp shuffle is a warp-level primitive that allows for the communication between the threads of a warp. Below is a simple example that shows how the value of the thread with index 2 is copied to all other threads within the warp. +![warp_shuffle_simple.svg](warp_shuffle_simple.svg) + +`__shfl(var, src_lane, width = warp_size)` copies the value of `var` from the thread `src_lane` within the warp. This operation admits a third parameter (not used in this example), `width`, defaulted to the warp size value and which allows restricting the number of threads of the warp from which values are read. Values are copied from threads with an ID in the range $[0, width-1]$. If the ID of the thread specified in the call to `__shfl` is out of that range, then the thread accessed is the one with that ID modulo `width`. The `src_lane` may also vary per thread, as shown below. + +![warp_shuffle.svg](warp_shuffle.svg) + +- `hipGetDeviceProperties` gets the properties of the specified device.
In this example, it is used to get the warp size of the device (GPU) used. +- `hipMalloc` allocates memory in the global memory of the device, and with `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others. +- `hipLaunchKernelGGL` queues the execution of a kernel on a device (GPU). +- `hipGetLastError` gets the last error returned by any HIP runtime API call. +- `hipFree` deallocates device memory allocated with `hipMalloc`. + +## Demonstrated API Calls + +### HIP runtime + +#### Device symbols +- `__global__` +- `threadIdx` +- `__shfl` + +#### Host symbols +- `hipFree` +- `hipGetDeviceProperties` +- `hipGetLastError` +- `hipLaunchKernelGGL` +- `hipMalloc` +- `hipMemcpy` +- `hipMemcpyDeviceToHost` +- `hipMemcpyHostToDevice` +- `hipStreamDefault` diff --git a/HIP-Basic/warp_shuffle/main.hip b/HIP-Basic/warp_shuffle/main.hip new file mode 100644 index 00000000..1174d924 --- /dev/null +++ b/HIP-Basic/warp_shuffle/main.hip @@ -0,0 +1,156 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "example_utils.hpp" + +#include + +#include +#include +#include +#include + +/// \brief Transposes the matrix \p in and stores the result in \p out using warp shuffle operations. +__global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width) +{ + // Compute the row and column indexes of the matrix element that each thread is going + // to process. Since in this example there is only one block, the indexes are + // precisely the thread's ID in each dimension. + const unsigned int x = threadIdx.x; + const unsigned int y = threadIdx.y; + + // If not out of bounds, transpose element. + if(x < width && y < width) + { + // Read element from global memory. Each thread in the warp is reading the element that + // the thread with global id x * width + y will transpose. + const float val = in[y * width + x]; + + // Transpose element reading it from the corresponding thread with a shuffle operation (__shfl). + // __shfl does not require all threads to be active, so it can be inside the if block. + // Note that, since the matrix in this example has no more elements than the warp size value, + // the ID within the warp of each thread matches its global ID. + out[x * width + y] = __shfl(val, y * width + x); + } +} + +/// \brief CPU implementation of matrix transpose.
+std::vector expected_matrix_transpose(const std::vector& input, + const unsigned int width) +{ + std::vector output(width * width); + for(unsigned int j = 0; j < width; j++) + { + for(unsigned int i = 0; i < width; i++) + { + output[i * width + j] = input[j * width + i]; + } + } + return output; +} + +int main() +{ + // Number of rows and columns, total number of elements and size in bytes of the matrix + // to be transposed. + constexpr unsigned int width = 4; + constexpr unsigned int size = width * width; + constexpr unsigned int size_bytes = size * sizeof(float); + + // Get device's warp size. + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0 /*device ID*/)); + + // To guarantee the correct behaviour of the program, keep total number of matrix elements + // below (or equal to) warp size. + assert(size <= props.warpSize + && "Matrix has more elements than architecture's warp size value."); + + // Block (2D) and grid sizes. Note that in this example we have only 1 block (and 1 warp). + constexpr dim3 block_dim(width, width); + constexpr dim3 grid_dim(1); + + // Allocate host input matrix and initialize with increasing sequence 10, 20, 30, .... + std::vector matrix(size); + std::iota(matrix.begin(), matrix.end(), 1.f); + std::for_each(matrix.begin(), matrix.end(), [](float& f) { f = 10.f * f; }); + + // Allocate matrix to store the results of the kernel execution. + std::vector transposed_matrix(size); + + // Allocate input and output matrices on device. + float* d_matrix{}; + float* d_transposed_matrix{}; + HIP_CHECK(hipMalloc(&d_matrix, size_bytes)); + HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes)); + + // Copy input matrix data from host to device. + HIP_CHECK(hipMemcpy(d_matrix, matrix.data(), size_bytes, hipMemcpyHostToDevice)); + + // Print trace message. + std::cout << "Computing matrix transpose." 
<< std::endl; + + // Launching kernel from host + hipLaunchKernelGGL(matrix_transpose_kernel, + grid_dim, + block_dim, + 0, + hipStreamDefault, + d_transposed_matrix, + d_matrix, + width); + + // Check if the kernel launch was successful. + HIP_CHECK(hipGetLastError()); + + // Copy results from device to host. + HIP_CHECK(hipMemcpy(transposed_matrix.data(), + d_transposed_matrix, + size_bytes, + hipMemcpyDeviceToHost)); + + // Free device memory. + HIP_CHECK(hipFree(d_matrix)); + HIP_CHECK(hipFree(d_transposed_matrix)); + + // Calculate expected transposed matrix with the CPU version of the kernel. + std::vector expected_transposed_matrix = expected_matrix_transpose(matrix, width); + + // Validate results by comparing with expected transposed matrix. + unsigned int errors = 0; + constexpr float eps = 1.0E-6; + std::cout << "Validating transposed matrix." << std::endl; + for(unsigned int i = 0; i < size; i++) + { + errors += (std::fabs(transposed_matrix[i] - expected_transposed_matrix[i]) > eps); + } + + if(errors) + { + std::cout << "Validation failed with " << errors << " errors." << std::endl; + return error_exit_code; + } + else + { + std::cout << "Validation passed." << std::endl; + } +} diff --git a/HIP-Basic/warp_shuffle/warp_shuffle.svg b/HIP-Basic/warp_shuffle/warp_shuffle.svg new file mode 100644 index 00000000..d493ffcc --- /dev/null +++ b/HIP-Basic/warp_shuffle/warp_shuffle.svg @@ -0,0 +1,3 @@ + + +
xn
xn
0
0
1
1
2
2
3
3
warp_size - 1
warp_size - 1
lane
lane
ret = __shfl(var, thread_idx % 2)
ret = __shfl(var, thread_idx % 2)
var
var
x1
x1
x1
x1
ret
ret
x0
x0
x0
x0
x1
x1
x1
x1
x0
x0
x2
x2
x3
x3
...
...
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_simple.svg b/HIP-Basic/warp_shuffle/warp_shuffle_simple.svg new file mode 100644 index 00000000..87183fad --- /dev/null +++ b/HIP-Basic/warp_shuffle/warp_shuffle_simple.svg @@ -0,0 +1,3 @@ + + +
xn
xn
0
0
1
1
2
2
3
3
warp_size - 1
warp_size - 1
lane
lane
ret = __shfl(var, 2)
ret = __shfl(var, 2)
var
var
x2
x2
x2
x2
ret
ret
x2
x2
x0
x0
x1
x1
x2
x2
x2
x2
x2
x2
x3
x3
...
...
...
...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.sln b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.sln new file mode 100644 index 00000000..164f344b --- /dev/null +++ b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_shuffle_vs2019", "warp_shuffle_vs2019.vcxproj", "{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.ActiveCfg = Debug|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.Build.0 = Debug|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.ActiveCfg = Release|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0} + EndGlobalSection +EndGlobal diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj new file mode 100644 index 00000000..c56ab539 --- /dev/null +++ b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj @@ -0,0 +1,99 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045} + Win32Proj + warp_shuffle_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + 
__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + stdcpp17 + $(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + + diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj.filters b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj.filters new file mode 100644 index 00000000..591e9f2c --- /dev/null +++ b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj.filters @@ -0,0 +1,27 @@ + + + + + {2932a426-602b-4926-887e-27c50ba7eab7} + cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu + + + {ed043ec4-e8ac-4831-93f5-a58546ec7bea} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh + + + {0da954bd-e555-4454-b082-b68d10c753b9} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff --git a/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj b/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj index dbe7b184..25cd7210 100644 --- a/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj +++ b/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj @@ -52,9 +52,11 @@ true + example_$(ProjectName) false + example_$(ProjectName) gfx1030 @@ -94,4 +96,4 @@ - \ No newline at end of file + diff --git a/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj b/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj index 72d44d3b..d3a64fb7 100644 --- a/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj +++ b/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj @@ -52,9 +52,11 @@ true + rocprim_$(ProjectName) false + rocprim_$(ProjectName) gfx1030 @@ -94,4 
+96,4 @@ - \ No newline at end of file + diff --git a/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj b/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj index 3a4abace..584b7d7c 100644 --- a/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj +++ b/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj @@ -20,7 +20,7 @@ 15.0 {E71DB5FB-A1C4-4BB4-8B46-0037C32C885E} Win32Proj - example_template_vs2019 + device_sum_vs2019 10.0 @@ -52,9 +52,11 @@ true + rocprim_$(ProjectName) false + rocprim_$(ProjectName) gfx1030 @@ -94,4 +96,4 @@ - \ No newline at end of file + diff --git a/Libraries/rocRAND/simple_distributions_cpp/Makefile b/Libraries/rocRAND/simple_distributions_cpp/Makefile index 3956c5f6..f573aea4 100644 --- a/Libraries/rocRAND/simple_distributions_cpp/Makefile +++ b/Libraries/rocRAND/simple_distributions_cpp/Makefile @@ -26,7 +26,7 @@ CUDACXX = $(CUDA_INSTALL_DIR)/bin/nvcc CXX_STD = c++17 COMMON_INCLUDE_DIR = ../../../Common -rocrand_simple_distributions_cpp: main.cpp argument_parsing.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp +rocrand_simple_distributions_cpp: main.cpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp ifeq ($(GPU_RUNTIME), CUDA) $(CUDACXX) $< -std=$(CXX_STD) -isystem $(ROCM_INSTALL_DIR)/include -isystem $(CUDA_INSTALL_DIR)/include -I $(COMMON_INCLUDE_DIR) -L $(ROCM_INSTALL_DIR)/lib -L $(CUDA_INSTALL_DIR)/lib64 -lrocrand -lcudart -o $@ -D__HIP_PLATFORM_NVIDIA__ -x cu else diff --git a/Libraries/rocRAND/simple_distributions_cpp/argument_parsing.hpp b/Libraries/rocRAND/simple_distributions_cpp/argument_parsing.hpp deleted file mode 100644 index 4cb0f65a..00000000 --- a/Libraries/rocRAND/simple_distributions_cpp/argument_parsing.hpp +++ /dev/null @@ -1,174 +0,0 @@ -// MIT License -// -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#ifndef SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP -#define SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP - -#include -#include -#include -#include -#include - -// Needed for the 's' suffix of `std::string` literals. -using namespace std::string_literals; - -/// \brief The random distribution kind selected on the command line. -enum class Distribution -{ - uniform_int, - uniform_real, - normal, - poisson -}; - -/// \brief The set of arguments parsed from the command line. -struct CliArguments -{ - int device_id_; - Distribution distribution_; - size_t size_; - bool print_; -}; - -/// \brief Operator overload to simply print a \p CliArguments instance. -std::ostream& operator<<(std::ostream& os, const CliArguments& cli_args) -{ - // An immediately-invoked lambda expression selects the name of the distribution. 
- const std::string_view distribution_name = [&]() - { - switch(cli_args.distribution_) - { - case Distribution::uniform_int: return "uniform_int"; - case Distribution::uniform_real: return "uniform_real"; - case Distribution::normal: return "normal"; - case Distribution::poisson: return "poisson"; - default: return "unknown"; - } - }(); - - // Printing the fields to the `std::ostream` object. - return os << "Selected device id: " << cli_args.device_id_ - << "\nSelected distribution: " << distribution_name - << "\nSelected size: " << cli_args.size_ << "\nPrinting results: " << std::boolalpha - << cli_args.print_; -} - -/// \brief Converts a \p std::string_view to integral type \p T. -// Throws an exception with an error message if the conversion is unsuccessful. -template -T parse_integral_arg(const std::string_view arg_value) -{ - T value; - // Try to convert the string_view to an integral type. If successful, the value is written to - // the variable `value` - const auto conversion_result - = std::from_chars(arg_value.data(), arg_value.data() + arg_value.size(), value); - // The default constructed `std::errc` stands for successful conversion. - if(conversion_result.ec != std::errc{}) - { - throw std::runtime_error( - "Could not convert argument \""s.append(arg_value).append("\" to an integral value")); - } - return value; -} - -/// \brief Parses an \p std::string_view to a \p Distribution. -/// Throws an exception with an error message if the conversion is unsuccessful. 
-Distribution parse_distribution_arg(const std::string_view distribution_arg) -{ - if(distribution_arg == "uniform_int") - { - return Distribution::uniform_int; - } - if(distribution_arg == "uniform_real") - { - return Distribution::uniform_real; - } - if(distribution_arg == "normal") - { - return Distribution::normal; - } - if(distribution_arg == "poisson") - { - return Distribution::poisson; - } - throw std::runtime_error( - "Argument \""s.append(distribution_arg).append("\" is not a valid distribution")); -} - -/// \brief Parses the array of command line arguments to parameters consumed by the rest -/// of the program. \p argc must be set to the size of the \p argv array. Each pointer in -/// the \p argv array must point to a valid null-terminated string containing the argument. -CliArguments parse_args(const int argc, const char** argv) -{ - // Pointers fulfill the random access iterator traits, thereby can be used with the - // standard algorithms. - const char** argv_end = argv + argc; - - // This local function searches for `arg_name` in the argument array and returns true if found. - const auto find_argument = [&](const std::string_view arg_name) - { - const auto arg_name_it = std::find(argv, argv_end, arg_name); - return arg_name_it != argv_end; - }; - - // This local function searches for `arg_name` in the argument array. If found, it returns a pointer - // to the next argument -- that is assumed to be the provided value. Otherwise returns a null optional. - // If the found argument is the last one, an exception with an error message is thrown. 
- const auto find_argument_value - = [&](const std::string_view arg_name) -> std::optional - { - const auto arg_name_it = std::find(argv, argv_end, arg_name); - if(arg_name_it == argv_end) - { - return std::nullopt; - } - // std::next returns the iterator copied and advanced by one - const auto arg_value_it = std::next(arg_name_it); - if(arg_value_it == argv_end) - { - throw std::runtime_error("Value for argument is not supplied: "s.append(arg_name)); - } - return std::make_optional(*arg_value_it); - }; - - // The options below need provided values, thereby `find_argument_value` is used. - const auto device_arg = find_argument_value("--device").value_or("0"); - const auto distribution_arg = find_argument_value("--distribution").value_or("uniform_int"); - const auto size_arg = find_argument_value("--size").value_or("10000000"); - - // The option below is just a flag. Its existence is checked by `find_argument`. - const bool print_arg = find_argument("--print"); - - // Parse the arguments read to the corresponding type and return. - return {parse_integral_arg(device_arg), - parse_distribution_arg(distribution_arg), - parse_integral_arg(size_arg), - print_arg}; -} - -constexpr std::string_view cli_usage_message - = "Usage: simple_distributions_cpp [--device ] [--distribution " - "{uniform_int|uniform_real|normal|poisson}] [--size ] [--print]"; - -#endif // SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP diff --git a/Libraries/rocRAND/simple_distributions_cpp/main.cpp b/Libraries/rocRAND/simple_distributions_cpp/main.cpp index 1f370ec8..0864c765 100644 --- a/Libraries/rocRAND/simple_distributions_cpp/main.cpp +++ b/Libraries/rocRAND/simple_distributions_cpp/main.cpp @@ -20,21 +20,22 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
-#include -#include -#include -#include -#include +#include "cmdparser.hpp" +#include "example_utils.hpp" #include + // Workaround for ROCm on Windows not including `__half` definitions, in a host compiler. #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP__) && (defined(WIN32) || defined(_WIN32)) #include #endif #include -#include "argument_parsing.hpp" -#include "example_utils.hpp" +#include +#include +#include +#include +#include // An anonymous namespace sets static linkage to its contents. // This means that the contained function definitions will only be visible @@ -42,6 +43,15 @@ namespace { +/// \brief The random distribution kind selected on the command line. +enum class Distribution +{ + uniform_int, + uniform_real, + normal, + poisson +}; + /// \brief Selects the device (GPU) with the provided ID. If it cannot be selected /// (e.g. a non-existent device ID is passed), an exception is thrown. /// Otherwise, the name of the device is queried and printed to the standard output. @@ -184,32 +194,90 @@ void dispatch_distribution_type(const Distribution dist, const size_t size, cons } } +void configure_parser(cli::Parser& parser) +{ + // Default parameters + parser.set_optional("device", "device", 0, + "Device Id"); // Default Device 0 + parser.set_optional("distribution", + "distribution", + "uniform_int", + "rocRAND distribution"); // Default "uniform_int" + parser.set_optional("size", "size", 10000000, + "Problem size"); // Default 10000000 + + parser.set_optional( + "print", + "print", + 0, + "Toggle printing on or off. This is a boolean argument and takes no value. 
If it is " + "provided the value is set to \"on\""); // Default "off" +} + +Distribution get_distribution(std::string distribution_arg) +{ + Distribution distribution_enum; + if(distribution_arg == "uniform_int") + { + distribution_enum = Distribution::uniform_int; + } + else if(distribution_arg == "uniform_real") + { + distribution_enum = Distribution::uniform_real; + } + else if(distribution_arg == "normal") + { + distribution_enum = Distribution::normal; + } + else if(distribution_arg == "poisson") + { + distribution_enum = Distribution::poisson; + } + else + { + std::cerr << distribution_arg << (" is not a valid distribution.") << std::endl; + exit(error_exit_code); + } + return distribution_enum; +} + } // namespace int main(const int argc, const char** argv) { - CliArguments args; - try + + // Get the number of hip devices in the system + int number_of_devies = 0; + HIP_CHECK(hipGetDeviceCount(&number_of_devies)) + + if(number_of_devies <= 0) { - // Parsing command line arguments. If something unexpected happens (e.g. missing arguments or - // wrong format), an exception is thrown. - args = parse_args(argc, argv); - // The parsed arguments are logged to the output to provide feedback to the user. - // For implementation, see `std::ostream& operator<<(std::ostream& os, const CliArguments& cli_args)` - std::cout << args << std::endl; + std::cerr << "HIP supported devices not found!" + << "\n"; + exit(error_exit_code); } - catch(const std::exception& ex) + + // Parse user inputs + cli::Parser parser(argc, argv); + configure_parser(parser); + parser.run_and_exit_if_error(); + + // Get user arguments, if provided. + const int device_id = parser.get("device"); + if(device_id < 0 || device_id >= number_of_devies) { - // The exception is caught, and an error message and the command line help is printed. - // The program returns with a non-zero exit code. - std::cerr << "Could not parse arguments. 
Error: "s.append(ex.what()) << "\n" - << cli_usage_message << std::endl; - return error_exit_code; + std::cerr << "Invalid device id " << device_id << "!\n" + << "Device does not exist\n"; + exit(error_exit_code); } + Distribution distribution = get_distribution(parser.get("distribution")); + size_t size = parser.get("size"); + bool print = parser.get("print"); + // Set up the used device (GPU) according to the command line supplied argument. - set_device(args.device_id_); + set_device(device_id); // Run the selected measurement on the device (GPU) and host (CPU). - dispatch_distribution_type(args.distribution_, args.size_, args.print_); + dispatch_distribution_type(distribution, size, print); } diff --git a/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj b/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj index f0df0007..a7664232 100644 --- a/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj +++ b/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj @@ -1,102 +1,104 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - - 15.0 - {13bb009a-0679-49c0-a763-3f0a388ea78f} - Win32Proj - simple_distributions_cpp_vs2019 - 10.0 - - - - Application - true - HIP - Unicode - - - Application - false - HIP - true - Unicode - - - - - - - - - - - - - - - - true - rocrand_$(ProjectName) - - - false - rocrand_$(ProjectName) - - - gfx1030 - - - gfx1030 - - - - Level1 - __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories) - stdcpp17 - - - Console - true - rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - Level2 - true - true - __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - 
$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories) - stdcpp17 - - - Console - true - true - true - rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - - + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + 15.0 + {13bb009a-0679-49c0-a763-3f0a388ea78f} + Win32Proj + simple_distributions_cpp_vs2019 + 10.0 + + + + Application + true + HIP + Unicode + + + Application + false + HIP + true + Unicode + + + + + + + + + + + + + + + + true + rocrand_$(ProjectName) + + + false + rocrand_$(ProjectName) + + + gfx1030 + + + gfx1030 + + + + Level1 + __HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + Level2 + true + true + __HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + true + true + rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + + \ No newline at end of file diff --git a/Libraries/rocThrust/norm/main.hip b/Libraries/rocThrust/norm/main.hip index 0b89fb63..2b286274 100644 --- a/Libraries/rocThrust/norm/main.hip +++ b/Libraries/rocThrust/norm/main.hip @@ -27,6 +27,7 @@ #include #include +#include #include "example_utils.hpp" @@ -70,4 +71,4 @@ int main() // print the Euclidean norm std::cout << "The Euclidean norm is: " << norm << std::endl; -} \ No newline at end of file +} diff --git 
a/Libraries/rocThrust/remove_points/Makefile b/Libraries/rocThrust/remove_points/Makefile index a21a9ef5..068b60ae 100644 --- a/Libraries/rocThrust/remove_points/Makefile +++ b/Libraries/rocThrust/remove_points/Makefile @@ -26,7 +26,7 @@ CXX_STD = c++17 COMMON_INCLUDE_DIR = ../../../Common rocthrust_remove_points: main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp - $(HIPCXX) $^ -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@ + $(HIPCXX) $< -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@ clean: rm -f rocthrust_remove_points diff --git a/Libraries/rocThrust/saxpy/Makefile b/Libraries/rocThrust/saxpy/Makefile index 887610db..1587cdcb 100644 --- a/Libraries/rocThrust/saxpy/Makefile +++ b/Libraries/rocThrust/saxpy/Makefile @@ -26,7 +26,7 @@ CXX_STD = c++17 COMMON_INCLUDE_DIR = ../../../Common rocthrust_saxpy: main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp - $(HIPCXX) $^ -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@ + $(HIPCXX) $< -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@ clean: rm -f rocthrust_saxpy diff --git a/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj b/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj index 169f0497..c12faddc 100644 --- a/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj +++ b/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj @@ -52,9 +52,11 @@ true + rocthrust_$(ProjectName) false + rocthrust_$(ProjectName) gfx1030 @@ -94,4 +96,4 @@ - \ No newline at end of file + diff --git a/Libraries/rocThrust/vectors/Makefile b/Libraries/rocThrust/vectors/Makefile index a441a896..cfc50037 100644 --- a/Libraries/rocThrust/vectors/Makefile +++ b/Libraries/rocThrust/vectors/Makefile @@ -26,7 +26,7 @@ CXX_STD = c++17 COMMON_INCLUDE_DIR = ../../../Common rocthrust_vectors: main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp - $(HIPCXX) $^ -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@ + $(HIPCXX) $< -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@ clean: rm -f rocthrust_vectors diff --git a/README.md b/README.md index 80eb7538..55d7a3a1 100644 --- a/README.md +++ 
b/README.md @@ -3,18 +3,24 @@ This project is currently unsupported and in an early testing stage. Feedback on ## Repository Contents - [Common](/Common/) contains common utility functionality shared between the examples. - [HIP-Basic](/HIP-Basic/) hosts self-contained recipes showcasing HIP runtime functionality. + - [assembly_to_executable](/HIP-Basic/assembly_to_executable): Program and accompanying build systems that show how to manually compile and link a HIP application from host and device code. + - [bandwidth](/HIP-Basic/bandwidth): Program that measures memory bandwidth from host to device, device to host, and device to device. - [device_query](/HIP-Basic/device_query): Program that showcases how properties from the device may be queried. - [dynamic_shared](/HIP-Basic/dynamic_shared): Program that showcases how to use dynamic shared memory with the help of a simple matrix transpose kernel. - [events](/HIP-Basic/events/): Measuring execution time and synchronizing with HIP events. - [hello_world](/HIP-Basic/hello_world): Simple program that showcases launching kernels and printing from the device. - [hipify](/HIP-Basic/hipify): Simple program and build definitions that showcase automatically converting a CUDA `.cu` source into portable HIP `.hip` source. + - [llvm_ir_to_executable](/HIP-Basic/llvm_ir_to_executable): Shows how to create a HIP executable from LLVM IR. - [matrix_multiplication](/HIP-Basic/matrix_multiplication/): Multiply two dynamically sized matrices utilizing shared memory. - [occupancy](/HIP-Basic/occupancy/): Shows how to find optimal configuation parameters for a kernel launch with maximum occupancy. + - [runtime_compilation](/HIP-Basic/runtime_compilation/): Simple program that showcases how to use HIP runtime compilation (hipRTC) to compile a kernel and launch it on a device. - [saxpy](/HIP-Basic/saxpy/): Implements the $Y_i=aX_i+Y_i$ kernel and explains basic HIP functionality. 
+ - [shared_memory](/HIP-Basic/shared_memory/): Showcases how to use static shared memory by implementing a simple matrix transpose kernel. - [streams](/HIP-Basic/streams/): Program that showcases usage of multiple streams each with their own tasks. -- [Dockerfiles](/Dockerfiles/) hosts Dockerfiles with ready-to-use environments for the various samples. See [Dockerfiles/README.md](Dockerfiles/README.md) for details. -- [docs](/docs/) - - [CONTRIBUTING.md](docs/CONTRIBUTING.md) contains information on how to contribute to the examples. + - [warp_shuffle](/HIP-Basic/warp_shuffle/): Uses a simple matrix transpose kernel to showcase how to use warp shuffle operations. +- [Dockerfiles](/Dockerfiles/) hosts Dockerfiles with ready-to-use environments for the various samples. See [Dockerfiles/README.md](/Dockerfiles/README.md) for details. +- [Docs](/Docs/) + - [CONTRIBUTING.md](/Docs/CONTRIBUTING.md) contains information on how to contribute to the examples. - [Libraries](/Libraries/) - [hipCUB](/Libraries/hipCUB/) - [device_radix_sort](/Libraries/hipCUB/device_radix_sort/): Simple program that showcases `hipcub::DeviceRadixSort::SortPairs`. 
diff --git a/ROCm-Examples-VS2019.sln b/ROCm-Examples-VS2019.sln index 1e2cfc94..675964e0 100644 --- a/ROCm-Examples-VS2019.sln +++ b/ROCm-Examples-VS2019.sln @@ -1,4 +1,3 @@ - Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.32630.194 @@ -49,12 +48,24 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix_multiplication_vs201 EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "occupancy_vs2019", "HIP-Basic\occupancy\occupancy_vs2019.vcxproj", "{E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "runtime_compilation_vs2019", "HIP-Basic\runtime_compilation\runtime_compilation_vs2019.vcxproj", "{E03790B7-B203-4504-BEF5-F4F061183642}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dynamic_shared_vs2019", "HIP-Basic\dynamic_shared\dynamic_shared_vs2019.vcxproj", "{7B7D1745-7635-40DA-B6AF-B8F728A31124}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "shared_memory_vs2019", "HIP-Basic\shared_memory\shared_memory_vs2019.vcxproj", "{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "streams_vs2019", "HIP-Basic\streams\streams_vs2019.vcxproj", "{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "events_vs2019", "HIP-Basic\events\events_vs2019.vcxproj", "{5B822836-110B-44D8-8E02-2A9B2CB83D14}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidth_vs2019", "HIP-Basic\bandwidth\bandwidth_vs2019.vcxproj", "{16B11B54-CD72-43B6-B226-38C668B41A79}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_shuffle_vs2019", "HIP-Basic\warp_shuffle\warp_shuffle_vs2019.vcxproj", "{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "assembly_to_executable_vs2019", 
"HIP-Basic\assembly_to_executable\assembly_to_executable_vs2019.vcxproj", "{60B4ADE0-8286-46AE-B884-5DA51B541DED}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llvm_ir_to_executable_vs2019", "HIP-Basic\llvm_ir_to_executable\llvm_ir_to_executable_vs2019.vcxproj", "{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -125,10 +136,18 @@ Global {E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}.Debug|x64.Build.0 = Debug|x64 {E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}.Release|x64.ActiveCfg = Release|x64 {E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}.Release|x64.Build.0 = Release|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.ActiveCfg = Debug|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.Build.0 = Debug|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.ActiveCfg = Release|x64 + {E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.Build.0 = Release|x64 {7B7D1745-7635-40DA-B6AF-B8F728A31124}.Debug|x64.ActiveCfg = Debug|x64 {7B7D1745-7635-40DA-B6AF-B8F728A31124}.Debug|x64.Build.0 = Debug|x64 {7B7D1745-7635-40DA-B6AF-B8F728A31124}.Release|x64.ActiveCfg = Release|x64 {7B7D1745-7635-40DA-B6AF-B8F728A31124}.Release|x64.Build.0 = Release|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.ActiveCfg = Debug|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.Build.0 = Debug|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.ActiveCfg = Release|x64 + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.Build.0 = Release|x64 {4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}.Debug|x64.ActiveCfg = Debug|x64 {4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}.Debug|x64.Build.0 = Debug|x64 {4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}.Release|x64.ActiveCfg = Release|x64 @@ -137,6 +156,22 @@ Global {5B822836-110B-44D8-8E02-2A9B2CB83D14}.Debug|x64.Build.0 = Debug|x64 {5B822836-110B-44D8-8E02-2A9B2CB83D14}.Release|x64.ActiveCfg = Release|x64 {5B822836-110B-44D8-8E02-2A9B2CB83D14}.Release|x64.Build.0 = 
Release|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.ActiveCfg = Debug|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.Build.0 = Debug|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.ActiveCfg = Release|x64 + {16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.Build.0 = Release|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.ActiveCfg = Debug|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.Build.0 = Debug|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.ActiveCfg = Release|x64 + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.Build.0 = Release|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.ActiveCfg = Debug|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.Build.0 = Debug|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.ActiveCfg = Release|x64 + {60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.Build.0 = Release|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.ActiveCfg = Debug|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.Build.0 = Debug|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.ActiveCfg = Release|x64 + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -155,7 +190,7 @@ Global {631C61AA-52BA-4818-BD39-FA9CF47076C7} = {481D0AFC-64BC-436C-9FF5-7C07F9F8E4BD} {E1D552CF-3FE3-427A-95E1-8CFFB60BBF8E} = {481D0AFC-64BC-436C-9FF5-7C07F9F8E4BD} {0A489EDA-4BAD-4966-B439-37260D37D969} = {052412EF-7CEB-4E32-96F9-AADBC70945D7} - {B885EF49-EDAA-4474-8D31-E0EF71D2BB3D} = {0A489EDA-4BAD-4966-B439-37260D37D969} + {B885EF49-EDAA-4474-8D31-E0EF71D2BB3D} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {82BF226F-956B-4E2E-B295-71C17F33A5FB} = {052412EF-7CEB-4E32-96F9-AADBC70945D7} {E71DB5FB-A1C4-4BB4-8B46-0037C32C885E} = {82BF226F-956B-4E2E-B295-71C17F33A5FB} {65B21869-2BE2-4DA5-BEC5-28D1F910731C} = {82BF226F-956B-4E2E-B295-71C17F33A5FB} @@ -163,9 +198,15 @@ Global 
{D6334F08-D560-439A-A704-ADA0349D72B7} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {E03790B7-B203-4504-BEF5-F4F061183642} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {7B7D1745-7635-40DA-B6AF-B8F728A31124} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9} = {6EB7144D-2707-489E-A043-D59B7BE006D1} {5B822836-110B-44D8-8E02-2A9B2CB83D14} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {16B11B54-CD72-43B6-B226-38C668B41A79} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {5852BE0E-BDA5-4BD9-8A16-30E8E40F4045} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {60B4ADE0-8286-46AE-B884-5DA51B541DED} = {6EB7144D-2707-489E-A043-D59B7BE006D1} + {DBB8DFE9-CB1B-473C-937C-2A8120E0D819} = {6EB7144D-2707-489E-A043-D59B7BE006D1} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {90580497-38BF-428E-A951-6EC6CFC68193} diff --git a/scripts/code-format/check-format.sh b/Scripts/CodeFormat/check_format.sh similarity index 100% rename from scripts/code-format/check-format.sh rename to Scripts/CodeFormat/check_format.sh