Browse Source
* Resolve "Set Occupancy and Stream examples as C++ 17 standard in Windows Visual Studio" * Resolve "Fix Bug in Makefile of rocThrust vectors, remove_points and saxpy" * Fixing rebasing mistake in HIP-Basic events * Resolve "Port the command line parsing of rocRAND example and matrix multiplication to the common cmdparser.hpp" * Resolve "Bandwidth example" * Add required vulkan packages to rocm dockerfile * Fix the build type variable name in the cmake build on windows * increase CI docker build timeout * Add required vulkan packages to cuda dockerfile * fix vs build files for device query, matrix multiplication, device sum * fix target names in vcxproj * Runtime compilation example * Remove gfx90c target from saxpy * Fix README.md titles and typos in examples * Shared memory example * update docker files to rocm 5.3 * Resolve "2D Shuffle example" * Assembly to Executable * Resolve "LLVM IR to executable example" * Reordering some statements * update copyright of llvm ir to executable/assembly to executable * Resolve "Upstreaming examples fixes" Co-authored-by: Bibrak Qamar <bibrak@streamhpc.com> Co-authored-by: Lőrinc Serfőző <lorinc@streamhpc.com> Co-authored-by: Gergely Mészáros <gergely@streamhpc.com> Co-authored-by: Beatriz Navidad Vilches <beatriz@streamhpc.com> Co-authored-by: Saad Rahim <44449863+saadrahim@users.noreply.github.com>pull/12/head
105 changed files with 7396 additions and 642 deletions
@ -0,0 +1,768 @@
@@ -0,0 +1,768 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2015 - 2016 Florian Rappl
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
/*
|
||||
This file is part of the C++ CmdParser utility. |
||||
Copyright (c) 2015 - 2019 Florian Rappl |
||||
*/ |
||||
|
||||
#pragma once |
||||
#include <cstdlib>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <typeinfo>
#include <utility>
#include <vector>
||||
|
||||
namespace cli |
||||
{ |
||||
/// Class used to wrap integer types to specify desired numerical base for specific argument parsing
|
||||
template<typename T, int numericalBase = 0> |
||||
class NumericalBase |
||||
{ |
||||
public: |
||||
/// This constructor required for correct AgrumentCountChecker initialization
|
||||
NumericalBase() : value(0), base(numericalBase) {} |
||||
|
||||
/// This constructor required for default value initialization
|
||||
/// \param val comes from default value
|
||||
NumericalBase(T val) : value(val), base(numericalBase) {} |
||||
|
||||
operator T() const |
||||
{ |
||||
return this->value; |
||||
} |
||||
operator T*() |
||||
{ |
||||
return this->value; |
||||
} |
||||
|
||||
T value; |
||||
unsigned int base; |
||||
}; |
||||
|
||||
struct CallbackArgs |
||||
{ |
||||
const std::vector<std::string>& arguments; |
||||
std::ostream& output; |
||||
std::ostream& error; |
||||
}; |
||||
class Parser |
||||
{ |
||||
private: |
||||
class CmdBase |
||||
{ |
||||
public: |
||||
explicit CmdBase(const std::string& name, |
||||
const std::string& alternative, |
||||
const std::string& description, |
||||
bool required, |
||||
bool dominant, |
||||
bool variadic) |
||||
: name(name) |
||||
, command(name.size() > 0 ? "-" + name : "") |
||||
, alternative(alternative.size() > 0 ? "--" + alternative : "") |
||||
, description(description) |
||||
, required(required) |
||||
, handled(false) |
||||
, arguments({}) |
||||
, dominant(dominant) |
||||
, variadic(variadic) |
||||
{} |
||||
|
||||
virtual ~CmdBase() {} |
||||
|
||||
std::string name; |
||||
std::string command; |
||||
std::string alternative; |
||||
std::string description; |
||||
bool required; |
||||
bool handled; |
||||
std::vector<std::string> arguments; |
||||
bool const dominant; |
||||
bool const variadic; |
||||
|
||||
virtual std::string print_value() const = 0; |
||||
virtual bool parse(std::ostream& output, std::ostream& error) = 0; |
||||
|
||||
bool is(const std::string& given) const |
||||
{ |
||||
return given == command || given == alternative; |
||||
} |
||||
}; |
||||
|
||||
template<typename T> |
||||
struct ArgumentCountChecker |
||||
{ |
||||
static constexpr bool Variadic = false; |
||||
}; |
||||
|
||||
template<typename T> |
||||
struct ArgumentCountChecker<cli::NumericalBase<T>> |
||||
{ |
||||
static constexpr bool Variadic = false; |
||||
}; |
||||
|
||||
template<typename T> |
||||
struct ArgumentCountChecker<std::vector<T>> |
||||
{ |
||||
static constexpr bool Variadic = true; |
||||
}; |
||||
|
||||
template<typename T> |
||||
class CmdFunction final : public CmdBase |
||||
{ |
||||
public: |
||||
explicit CmdFunction(const std::string& name, |
||||
const std::string& alternative, |
||||
const std::string& description, |
||||
bool required, |
||||
bool dominant) |
||||
: CmdBase(name, |
||||
alternative, |
||||
description, |
||||
required, |
||||
dominant, |
||||
ArgumentCountChecker<T>::Variadic) |
||||
{} |
||||
|
||||
virtual bool parse(std::ostream& output, std::ostream& error) |
||||
{ |
||||
try |
||||
{ |
||||
CallbackArgs args{arguments, output, error}; |
||||
value = callback(args); |
||||
return true; |
||||
} |
||||
catch(...) |
||||
{ |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
virtual std::string print_value() const |
||||
{ |
||||
return ""; |
||||
} |
||||
|
||||
std::function<T(CallbackArgs&)> callback; |
||||
T value; |
||||
}; |
||||
|
||||
template<typename T> |
||||
class CmdArgument final : public CmdBase |
||||
{ |
||||
public: |
||||
explicit CmdArgument(const std::string& name, |
||||
const std::string& alternative, |
||||
const std::string& description, |
||||
bool required, |
||||
bool dominant) |
||||
: CmdBase(name, |
||||
alternative, |
||||
description, |
||||
required, |
||||
dominant, |
||||
ArgumentCountChecker<T>::Variadic) |
||||
{} |
||||
|
||||
virtual bool parse(std::ostream&, std::ostream&) |
||||
{ |
||||
try |
||||
{ |
||||
value = Parser::parse(arguments, value); |
||||
return true; |
||||
} |
||||
catch(...) |
||||
{ |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
virtual std::string print_value() const |
||||
{ |
||||
return stringify(value); |
||||
} |
||||
|
||||
T value; |
||||
}; |
||||
|
||||
static int parse(const std::vector<std::string>& elements, const int&, int numberBase = 0) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stoi(elements[0], 0, numberBase); |
||||
} |
||||
|
||||
static bool parse(const std::vector<std::string>& elements, const bool& defval) |
||||
{ |
||||
if(elements.size() != 0) |
||||
throw std::runtime_error("A boolean command line parameter cannot have any arguments."); |
||||
|
||||
return !defval; |
||||
} |
||||
|
||||
static double parse(const std::vector<std::string>& elements, const double&) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stod(elements[0]); |
||||
} |
||||
|
||||
static float parse(const std::vector<std::string>& elements, const float&) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stof(elements[0]); |
||||
} |
||||
|
||||
static long double parse(const std::vector<std::string>& elements, const long double&) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stold(elements[0]); |
||||
} |
||||
|
||||
static unsigned int |
||||
parse(const std::vector<std::string>& elements, const unsigned int&, int numberBase = 0) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return static_cast<unsigned int>(std::stoul(elements[0], 0, numberBase)); |
||||
} |
||||
|
||||
static unsigned long |
||||
parse(const std::vector<std::string>& elements, const unsigned long&, int numberBase = 0) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stoul(elements[0], 0, numberBase); |
||||
} |
||||
|
||||
static unsigned long long parse(const std::vector<std::string>& elements, |
||||
const unsigned long long&, |
||||
int numberBase = 0) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stoull(elements[0], 0, numberBase); |
||||
} |
||||
|
||||
static long long |
||||
parse(const std::vector<std::string>& elements, const long long&, int numberBase = 0) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stoll(elements[0], 0, numberBase); |
||||
} |
||||
|
||||
static long parse(const std::vector<std::string>& elements, const long&, int numberBase = 0) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return std::stol(elements[0], 0, numberBase); |
||||
} |
||||
|
||||
static std::string parse(const std::vector<std::string>& elements, const std::string&) |
||||
{ |
||||
if(elements.size() != 1) |
||||
throw std::bad_cast(); |
||||
|
||||
return elements[0]; |
||||
} |
||||
|
||||
template<class T> |
||||
static std::vector<T> parse(const std::vector<std::string>& elements, const std::vector<T>&) |
||||
{ |
||||
const T defval = T(); |
||||
std::vector<T> values{}; |
||||
std::vector<std::string> buffer(1); |
||||
|
||||
for(const auto& element : elements) |
||||
{ |
||||
buffer[0] = element; |
||||
values.push_back(parse(buffer, defval)); |
||||
} |
||||
|
||||
return values; |
||||
} |
||||
|
||||
template<typename T> |
||||
static T parse(const std::vector<std::string>& elements, const NumericalBase<T>& wrapper) |
||||
{ |
||||
return parse(elements, wrapper.value, 0); |
||||
} |
||||
|
||||
/// Specialization for number wrapped into numerical base
|
||||
/// \tparam T base type of the argument
|
||||
/// \tparam base numerical base
|
||||
/// \param elements
|
||||
/// \param wrapper
|
||||
/// \return parsed number
|
||||
template<typename T, int base> |
||||
static T parse(const std::vector<std::string>& elements, const NumericalBase<T, base>& wrapper) |
||||
{ |
||||
return parse(elements, wrapper.value, wrapper.base); |
||||
} |
||||
|
||||
template<class T> |
||||
static std::string stringify(const T& value) |
||||
{ |
||||
return std::to_string(value); |
||||
} |
||||
|
||||
template<class T, int base> |
||||
static std::string stringify(const NumericalBase<T, base>& wrapper) |
||||
{ |
||||
return std::to_string(wrapper.value); |
||||
} |
||||
|
||||
template<class T> |
||||
static std::string stringify(const std::vector<T>& values) |
||||
{ |
||||
std::stringstream ss{}; |
||||
ss << "[ "; |
||||
|
||||
for(const auto& value : values) |
||||
{ |
||||
ss << stringify(value) << " "; |
||||
} |
||||
|
||||
ss << "]"; |
||||
return ss.str(); |
||||
} |
||||
|
||||
static std::string stringify(const std::string& str) |
||||
{ |
||||
return str; |
||||
} |
||||
|
||||
public: |
||||
explicit Parser(int argc, const char** argv) : _appname(argv[0]) |
||||
{ |
||||
for(int i = 1; i < argc; ++i) |
||||
{ |
||||
_arguments.push_back(argv[i]); |
||||
} |
||||
enable_help(); |
||||
} |
||||
|
||||
explicit Parser(int argc, char** argv) : _appname(argv[0]) |
||||
{ |
||||
for(int i = 1; i < argc; ++i) |
||||
{ |
||||
_arguments.push_back(argv[i]); |
||||
} |
||||
enable_help(); |
||||
} |
||||
|
||||
Parser(int argc, const char** argv, std::string generalProgramDescriptionForHelpText) |
||||
: _appname(argv[0]), _general_help_text(std::move(generalProgramDescriptionForHelpText)) |
||||
{ |
||||
for(int i = 1; i < argc; ++i) |
||||
{ |
||||
_arguments.push_back(argv[i]); |
||||
} |
||||
enable_help(); |
||||
} |
||||
|
||||
Parser(int argc, char** argv, std::string generalProgramDescriptionForHelpText) |
||||
: _appname(argv[0]), _general_help_text(std::move(generalProgramDescriptionForHelpText)) |
||||
{ |
||||
for(int i = 1; i < argc; ++i) |
||||
{ |
||||
_arguments.push_back(argv[i]); |
||||
} |
||||
enable_help(); |
||||
} |
||||
|
||||
~Parser() |
||||
{ |
||||
for(size_t i = 0, n = _commands.size(); i < n; ++i) |
||||
{ |
||||
delete _commands[i]; |
||||
} |
||||
} |
||||
|
||||
bool has_help() const |
||||
{ |
||||
for(const auto& command : _commands) |
||||
{ |
||||
if(command->name == "h" && command->alternative == "--help") |
||||
{ |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
return false; |
||||
} |
||||
|
||||
void enable_help() |
||||
{ |
||||
set_callback("h", |
||||
"help", |
||||
std::function<bool(CallbackArgs&)>( |
||||
[this](CallbackArgs& args) |
||||
{ |
||||
args.output << this->usage(); |
||||
#pragma warning(push) |
||||
#pragma warning(disable : 4702) |
||||
exit(0); |
||||
return false; |
||||
#pragma warning(pop) |
||||
}), |
||||
"", |
||||
true); |
||||
} |
||||
|
||||
void disable_help() |
||||
{ |
||||
for(auto command = _commands.begin(); command != _commands.end(); ++command) |
||||
{ |
||||
if((*command)->name == "h" && (*command)->alternative == "--help") |
||||
{ |
||||
_commands.erase(command); |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
|
||||
template<typename T> |
||||
void set_default(bool is_required, const std::string& description = "") |
||||
{ |
||||
auto command = new CmdArgument<T>{"", "", description, is_required, false}; |
||||
_commands.push_back(command); |
||||
} |
||||
|
||||
template<typename T> |
||||
void set_required(const std::string& name, |
||||
const std::string& alternative, |
||||
const std::string& description = "", |
||||
bool dominant = false) |
||||
{ |
||||
auto command = new CmdArgument<T>{name, alternative, description, true, dominant}; |
||||
_commands.push_back(command); |
||||
} |
||||
|
||||
template<typename T> |
||||
void set_optional(const std::string& name, |
||||
const std::string& alternative, |
||||
T defaultValue, |
||||
const std::string& description = "", |
||||
bool dominant = false) |
||||
{ |
||||
auto command = new CmdArgument<T>{name, alternative, description, false, dominant}; |
||||
command->value = defaultValue; |
||||
_commands.push_back(command); |
||||
} |
||||
|
||||
template<typename T> |
||||
void set_callback(const std::string& name, |
||||
const std::string& alternative, |
||||
std::function<T(CallbackArgs&)> callback, |
||||
const std::string& description = "", |
||||
bool dominant = false) |
||||
{ |
||||
auto command = new CmdFunction<T>{name, alternative, description, false, dominant}; |
||||
command->callback = callback; |
||||
_commands.push_back(command); |
||||
} |
||||
|
||||
inline void run_and_exit_if_error() |
||||
{ |
||||
if(run() == false) |
||||
{ |
||||
exit(1); |
||||
} |
||||
} |
||||
|
||||
inline bool run() |
||||
{ |
||||
return run(std::cout, std::cerr); |
||||
} |
||||
|
||||
inline bool run(std::ostream& output) |
||||
{ |
||||
return run(output, std::cerr); |
||||
} |
||||
|
||||
bool doesArgumentExist(std::string name, std::string altName) |
||||
{ |
||||
for(const auto& argument : _arguments) |
||||
{ |
||||
|
||||
if(argument == '-' + name || argument == altName) |
||||
{ |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
return false; |
||||
} |
||||
|
||||
inline bool doesHelpExist() |
||||
{ |
||||
return doesArgumentExist("h", "--help"); |
||||
} |
||||
|
||||
bool run(std::ostream& output, std::ostream& error) |
||||
{ |
||||
if(_arguments.size() > 0) |
||||
{ |
||||
auto current = find_default(); |
||||
|
||||
for(size_t i = 0, n = _arguments.size(); i < n; ++i) |
||||
{ |
||||
auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; |
||||
auto associated = isarg ? find(_arguments[i]) : nullptr; |
||||
|
||||
if(associated != nullptr) |
||||
{ |
||||
current = associated; |
||||
associated->handled = true; |
||||
} |
||||
else if(current == nullptr) |
||||
{ |
||||
error << no_default(); |
||||
return false; |
||||
} |
||||
else |
||||
{ |
||||
current->arguments.push_back(_arguments[i]); |
||||
current->handled = true; |
||||
if(!current->variadic) |
||||
{ |
||||
// If the current command is not variadic, then no more arguments
|
||||
// should be added to it. In this case, switch back to the default
|
||||
// command.
|
||||
current = find_default(); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
// First, parse dominant arguments since they succeed even if required
|
||||
// arguments are missing.
|
||||
for(auto command : _commands) |
||||
{ |
||||
if(command->handled && command->dominant && !command->parse(output, error)) |
||||
{ |
||||
error << howto_use(command); |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
// Next, check for any missing arguments.
|
||||
for(auto command : _commands) |
||||
{ |
||||
if(command->required && !command->handled) |
||||
{ |
||||
error << howto_required(command); |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
// Finally, parse all remaining arguments.
|
||||
for(auto command : _commands) |
||||
{ |
||||
if(command->handled && !command->dominant && !command->parse(output, error)) |
||||
{ |
||||
error << howto_use(command); |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
return true; |
||||
} |
||||
|
||||
template<typename T> |
||||
T get(const std::string& name) const |
||||
{ |
||||
for(const auto& command : _commands) |
||||
{ |
||||
if(command->name == name) |
||||
{ |
||||
auto cmd = dynamic_cast<CmdArgument<T>*>(command); |
||||
|
||||
if(cmd == nullptr) |
||||
{ |
||||
throw std::runtime_error("Invalid usage of the parameter " + name |
||||
+ " detected."); |
||||
} |
||||
|
||||
return cmd->value; |
||||
} |
||||
} |
||||
|
||||
throw std::runtime_error("The parameter " + name + " could not be found."); |
||||
} |
||||
|
||||
template<typename T> |
||||
T get_if(const std::string& name, std::function<T(T)> callback) const |
||||
{ |
||||
auto value = get<T>(name); |
||||
return callback(value); |
||||
} |
||||
|
||||
int requirements() const |
||||
{ |
||||
int count = 0; |
||||
|
||||
for(const auto& command : _commands) |
||||
{ |
||||
if(command->required) |
||||
{ |
||||
++count; |
||||
} |
||||
} |
||||
|
||||
return count; |
||||
} |
||||
|
||||
int commands() const |
||||
{ |
||||
return static_cast<int>(_commands.size()); |
||||
} |
||||
|
||||
inline const std::string& app_name() const |
||||
{ |
||||
return _appname; |
||||
} |
||||
|
||||
protected: |
||||
CmdBase* find(const std::string& name) |
||||
{ |
||||
for(auto command : _commands) |
||||
{ |
||||
if(command->is(name)) |
||||
{ |
||||
return command; |
||||
} |
||||
} |
||||
|
||||
return nullptr; |
||||
} |
||||
|
||||
CmdBase* find_default() |
||||
{ |
||||
for(auto command : _commands) |
||||
{ |
||||
if(command->name == "") |
||||
{ |
||||
return command; |
||||
} |
||||
} |
||||
|
||||
return nullptr; |
||||
} |
||||
|
||||
std::string usage() const |
||||
{ |
||||
std::stringstream ss{}; |
||||
ss << _general_help_text << "\n\n"; |
||||
ss << "Available parameters:\n\n"; |
||||
|
||||
for(const auto& command : _commands) |
||||
{ |
||||
ss << " " << command->command << "\t" << command->alternative; |
||||
|
||||
if(command->required == true) |
||||
{ |
||||
ss << "\t(required)"; |
||||
} |
||||
|
||||
ss << "\n " << command->description; |
||||
|
||||
if(command->required == false) |
||||
{ |
||||
ss << "\n " |
||||
<< "This parameter is optional. The default value is '" + command->print_value() |
||||
<< "'."; |
||||
} |
||||
|
||||
ss << "\n\n"; |
||||
} |
||||
|
||||
return ss.str(); |
||||
} |
||||
|
||||
void print_help(std::stringstream& ss) const |
||||
{ |
||||
if(has_help()) |
||||
{ |
||||
ss << "For more help use --help or -h.\n"; |
||||
} |
||||
} |
||||
|
||||
std::string howto_required(CmdBase* command) const |
||||
{ |
||||
std::stringstream ss{}; |
||||
ss << "The parameter " << command->name << " is required.\n"; |
||||
ss << command->description << '\n'; |
||||
print_help(ss); |
||||
return ss.str(); |
||||
} |
||||
|
||||
std::string howto_use(CmdBase* command) const |
||||
{ |
||||
std::stringstream ss{}; |
||||
ss << "The parameter " << command->name << " has invalid arguments.\n"; |
||||
ss << command->description << '\n'; |
||||
print_help(ss); |
||||
return ss.str(); |
||||
} |
||||
|
||||
std::string no_default() const |
||||
{ |
||||
std::stringstream ss{}; |
||||
ss << "No default parameter has been specified.\n"; |
||||
ss << "The given argument must be used with a parameter.\n"; |
||||
print_help(ss); |
||||
return ss.str(); |
||||
} |
||||
|
||||
const std::string& get_general_help_text() const |
||||
{ |
||||
return _general_help_text; |
||||
} |
||||
|
||||
void set_general_help_text(const std::string& generalHelpText) |
||||
{ |
||||
_general_help_text = generalHelpText; |
||||
} |
||||
|
||||
private: |
||||
const std::string _appname; |
||||
std::string _general_help_text; |
||||
std::vector<std::string> _arguments; |
||||
std::vector<CmdBase*> _commands; |
||||
}; |
||||
} // namespace cli
|
@ -0,0 +1,3 @@
@@ -0,0 +1,3 @@
|
||||
hip_assembly_to_executable |
||||
*.o |
||||
*.hipfb |
@ -0,0 +1,174 @@
@@ -0,0 +1,174 @@
|
||||
# MIT License |
||||
# |
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
# |
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
# of this software and associated documentation files (the "Software"), to deal |
||||
# in the Software without restriction, including without limitation the rights |
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
# copies of the Software, and to permit persons to whom the Software is |
||||
# furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included in all |
||||
# copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
# SOFTWARE. |
||||
|
||||
# cmake_minimum_required should be the first command so that policies are set
# before anything else is evaluated.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

set(example_name hip_assembly_to_executable)
project(${example_name} LANGUAGES CXX)

set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")

# Only supported on HIP (not CUDA)
if(NOT "${GPU_RUNTIME}" STREQUAL "HIP")
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.")
    message(FATAL_ERROR ${ERROR_MESSAGE})
endif()

enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

# GPU_ARCHITECTURES defaults to CMAKE_HIP_ARCHITECTURES when that is defined
# and to the special value "all" otherwise.
if(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
    set(GPU_ARCHITECTURES "all" CACHE STRING "GPU architectures to compile for")
else()
    set(GPU_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "GPU architectures to compile for")
endif()

# "all" expands to every architecture for which an assembly file is provided.
if(GPU_ARCHITECTURES STREQUAL "all")
    set(GPU_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "GPU architectures to compile for" FORCE)
endif()

# Remove duplicates.
list(REMOVE_DUPLICATES GPU_ARCHITECTURES)
message(STATUS "GPU_ARCHITECTURES: ${GPU_ARCHITECTURES}")

# Only the host part of main.hip is compiled; the device code comes from the
# assembly files handled below.
set_source_files_properties(main.hip PROPERTIES COMPILE_OPTIONS "--cuda-host-only")

# Platform dependent names: object file extension, null device, host target
# triple and the assembler template that embeds the offload bundle.
if(WIN32)
    set(OBJ_TYPE obj)
    set(NULDEV NUL)
    set(HOST_TARGET x86_64-pc-windows-msvc)
    set(HIP_OBJ_GEN_FILE hip_obj_gen_win.mcin)
else()
    set(OBJ_TYPE o)
    set(NULDEV /dev/null)
    set(HOST_TARGET x86_64-unknown-linux)
    set(HIP_OBJ_GEN_FILE hip_obj_gen.mcin)
endif()

# Assemble the device assemblies to object files using the HIP compiler.
# The compiler needs -target amdgcn-amd-amdhsa -mcpu=gfx* in order to assemble the object file
# for the right GPU.
foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
    message(STATUS "Generating main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
    add_custom_command(
        OUTPUT main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
        COMMAND ${CMAKE_HIP_COMPILER} -fPIC -target amdgcn-amd-amdhsa -mcpu=${HIP_ARCHITECTURE}
            ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.s
            -o ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.s
        VERBATIM)
endforeach()

# Create an offload-bundle from the assembled object files. This needs the clang-offload-bundler tool.
find_program(
    OFFLOAD_BUNDLER_COMMAND clang-offload-bundler
    PATH_SUFFIXES bin
    PATHS
        ${ROCM_ROOT}/llvm
        ${CMAKE_INSTALL_PREFIX}/llvm
    REQUIRED)

if(OFFLOAD_BUNDLER_COMMAND)
    # Report the variable that find_program actually filled in.
    message(STATUS "clang-offload-bundler found: ${OFFLOAD_BUNDLER_COMMAND}")
else()
    message(FATAL_ERROR "clang-offload-bundler not found")
endif()

# Generate object bundle.
# The invocation to generate is
# clang-offload-bundler -targets=<targets> -input=<input target #1> -input=<input target #2> ... -output=<output>
# Note that the host target must be the first target present here, and it should have an empty input associated to it.

# Generate BUNDLE_TARGETS as a string of: -targets=host-${HOST_TARGET},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE},...
set(BUNDLE_TARGETS "-targets=host-${HOST_TARGET}")
# Generate BUNDLE_INPUTS as a list of: -input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} ...
set(BUNDLE_INPUTS "-input=${NULDEV}")
# Generate BUNDLE_OBJECTS as a list of: ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
set(BUNDLE_OBJECTS "")
foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
    set(BUNDLE_TARGETS "${BUNDLE_TARGETS},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE}")
    list(APPEND BUNDLE_INPUTS "-input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
    list(APPEND BUNDLE_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
endforeach()

# Invoke clang-offload-bundler to generate an offload bundle.
set(BUNDLE "${CMAKE_CURRENT_BINARY_DIR}/offload_bundle.hipfb")
add_custom_command(
    OUTPUT "${BUNDLE}"
    COMMAND
        "${OFFLOAD_BUNDLER_COMMAND}"
        -type=o
        -bundle-align=4096
        "${BUNDLE_TARGETS}"
        ${BUNDLE_INPUTS}
        "-output=${BUNDLE}"
    DEPENDS ${BUNDLE_OBJECTS}
    VERBATIM)

# Create the device binary by assembling the template that includes
# the offload bundle that was just generated using an .incbin directive.
# This needs an assembler.
find_program(
    LLVM_MC_COMMAND llvm-mc
    PATH_SUFFIXES bin
    PATHS
        ${ROCM_ROOT}/llvm
        ${CMAKE_INSTALL_PREFIX}/llvm)

if(LLVM_MC_COMMAND)
    message(STATUS "llvm-mc found: ${LLVM_MC_COMMAND}")
else()
    message(FATAL_ERROR "llvm-mc not found")
endif()

# Invoke llvm-mc to generate an object file containing the offload bundle.
set(DEVICE_OBJECT "${CMAKE_CURRENT_BINARY_DIR}/main_device.${OBJ_TYPE}")
add_custom_command(
    OUTPUT "${DEVICE_OBJECT}"
    COMMAND
        "${LLVM_MC_COMMAND}"
        -triple "${HOST_TARGET}"
        "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}"
        -o "${DEVICE_OBJECT}"
        --filetype=obj
    DEPENDS "${BUNDLE}"
    VERBATIM)

# Finally, create the executable.
add_executable(
    ${example_name}
    main.hip
    ${DEVICE_OBJECT})

# Make example runnable using ctest.
add_test(${example_name} ${example_name})

set(include_dirs "../../Common")
target_include_directories(${example_name} PRIVATE ${include_dirs})
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
@ -0,0 +1,89 @@
@@ -0,0 +1,89 @@
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
EXAMPLE := hip_assembly_to_executable
COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME ?= HIP

# This example is only supported with the HIP runtime.
ifneq ($(GPU_RUNTIME), HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
endif

# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include

HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
CLANG ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang
LLVM_MC ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc
CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler

# Common variables and flags
CXX_STD := c++17
CXXFLAGS := -std=$(CXX_STD)
CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
LDFLAGS :=
LDLIBS :=

# Compile for these GPU architectures (semicolon-separated list)
HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030

# If white-space is given as a literal, `subst` cannot recognize it.
# Therefore this `empty` `space` hack is used in the tokenizing of HIP_ARCHITECTURES
# and the creation of GPU_ARCH_TRIPLES, which is later passed to CLANG_OFFLOAD_BUNDLER
# in the targets option. The targets option needs to be a single string with no spaces.
empty =
space = $(empty) $(empty)
comma = ,

# GPU_ARCHS: space-separated architectures; GPU_ARCH_TRIPLES: comma-separated bundle targets.
GPU_ARCHS := $(subst ;,$(space),$(HIP_ARCHITECTURES))
GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amdhsa--%))

all: $(EXAMPLE)

# Link the host object with the object that embeds the offload bundle.
$(EXAMPLE): main.o main_device.o
	$(HIPCXX) -o $@ $^

# Assemble the template; only the template ($<) is passed to llvm-mc, the
# bundle is listed as a prerequisite so that a new bundle triggers a rebuild.
main_device.o: hip_obj_gen.mcin offload_bundle.hipfb
	$(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj

# Bundle the device objects; the host target comes first and gets an empty input.
offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o)
	$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 \
		-targets=host-x86_64-unknown-linux,$(GPU_ARCH_TRIPLES) \
		-input=/dev/null \
		$(^:%=-input=%) \
		-output=$@

# Compile only the host part of the HIP source.
main.o: main.hip
	$(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only -o $@ $<

# Assemble one device assembly file; the stem ($*) is the GPU architecture.
main_%.o: main_%.s
	$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $<

clean:
	rm -f \
		main_*.o \
		offload_bundle.hipfb \
		main_device.o \
		main.o \
		$(EXAMPLE)

# Only targets that do not name a file are phony. $(EXAMPLE) is a real output
# file, so it is left out to avoid relinking on every invocation.
.PHONY: all clean
@ -0,0 +1,115 @@
@@ -0,0 +1,115 @@
|
||||
# HIP-Basic Assembly to Executable Example |
||||
|
||||
## Description |
||||
This example shows how to manually compile and link a HIP application from device assembly. Pre-generated assembly files are compiled into an _offload bundle_, a bundle of device object files, and then linked with the host object file to produce the final executable. |
||||
|
||||
Building HIP executables from device assembly can be useful for example to experiment with specific instructions, perform specific optimizations, or can help debugging. |
||||
|
||||
### Building |
||||
|
||||
- Build with Makefile: to compile for specific GPU architectures, optionally provide the HIP_ARCHITECTURES variable. Provide the architectures separated by semicolons. |
||||
```shell |
||||
make HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" |
||||
``` |
||||
- Build with CMake: |
||||
```shell |
||||
cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" |
||||
cmake --build build |
||||
``` |
||||
On Windows the path to RC compiler may be needed: `-DCMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/path/to/x64/rc.exe"` |
||||
|
||||
## Generating device assembly |
||||
This example creates a HIP executable from device assembly code, however, such assembly files can also be created from HIP source code using `hipcc`. This can be done by passing `-S` and `--cuda-device-only` to hipcc. The former flag instructs the compiler to generate human-readable assembly instead of machine code, and the latter instructs the compiler to only compile the device part of the program. The six assembly files for this example were generated as follows: |
||||
```shell |
||||
$ROCM_INSTALL_DIR/bin/hipcc -S --cuda-device-only --offload-arch=gfx803 --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a --offload-arch=gfx1030 main.hip |
||||
``` |
||||
|
||||
The user may modify the `--offload-arch` flag to build for other architectures and choose to either enable or disable extra device code-generation features such as `xnack` or `sram-ecc`, which can be specified as `--offload-arch=<arch>:<feature>+` to enable it or `--offload-arch=<arch>:<feature>-` to disable it. Multiple features may be present, separated by colons. |
||||
|
||||
## Build Process |
||||
A HIP binary consists of a regular host executable, which has an offload bundle containing device code embedded inside it. This offload bundle contains object files for each of the target devices that it is compiled for, and is loaded at runtime to provide the machine code for the current device. A HIP executable can be built from device assembly files and host HIP code according to the following process: |
||||
|
||||
1. The `main.hip` file is compiled to an object file that only contains host code with `hipcc` by using the `--cuda-host-only` option. `main.hip` is a program that launches a simple kernel to compute the square of each element of a vector. The `-c` option is required to prevent the compiler from creating an executable, and make it create an object file containing the compiled host code instead. |
||||
```shell |
||||
$ROCM_INSTALL_DIR/bin/hipcc -c --cuda-host-only main.hip |
||||
``` |
||||
|
||||
2. Each device assembly file is compiled to a device object file using `clang`. This requires specifying the correct architecture using `-target amdgcn-amd-amdhsa`, and the target architecture that should be assembled for using `-mcpu`: |
||||
|
||||
```shell |
||||
$ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx1030 main_gfx1030.s -o main_gfx1030.o |
||||
$ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=<arch> main_<arch>.s -o main_<arch>.o |
||||
... |
||||
``` |
||||
|
||||
3. The device object files are combined into an offload bundle using `clang-offload-bundler`. This requires specifying the target as well as the offload kind for each device, in the form `<offload-kind>-<target>-<arch>`. For HIP device code, `<offload-kind>` is `hipv4`. Note that this command requires an (empty) entry for the host to also be present, with `<offload-kind>` `host`. The order of targets and inputs must match. `<target>` is an LLVM target triple, which is specified as `<isa>-<vendor>-<os>-<abi>`. `<abi>` is left empty for AMD targets. |
||||
|
||||
```shell |
||||
$ROCM_INSTALL_DIR/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 \ |
||||
-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx1030,hipv4-... \ |
||||
-input=/dev/null \ |
||||
-input=main_gfx1030.o -input=... \ |
||||
-output=offload_bundle.hipfb |
||||
``` |
||||
|
||||
Note: using -bundle-align=4096 only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same `--offload-arch` as when compiling to assembly. |
||||
|
||||
4. The offload bundle is embedded inside an object file that can be linked with the object file containing the host code. The offload bundle must be placed in the `.hip_fatbin` section, and must be placed after the symbol `__hip_fatbin`. This can be done by creating an assembly file that places the offload bundle in the appropriate section using the `.incbin` directive: |
||||
```nasm |
||||
.type __hip_fatbin,@object |
||||
; Tell the assembler to place the offload bundle in the appropriate section. |
||||
.section .hip_fatbin,"a",@progbits |
||||
; Make the symbol that addresses the binary public |
||||
.globl __hip_fatbin |
||||
; Give the bundle the required alignment |
||||
.p2align 12 |
||||
__hip_fatbin: |
||||
; Include the binary |
||||
.incbin "offload_bundle.hipfb" |
||||
``` |
||||
This file can then be assembled using `llvm-mc` as follows: |
||||
``` |
||||
$ROCM_INSTALL_DIR/llvm/bin/llvm-mc -triple <host target> -o main_device.o hip_obj_gen.mcin --filetype=obj |
||||
``` |
||||
|
||||
5. Finally, using the system linker, hipcc, or clang, the host object and device objects are linked into an executable: |
||||
```shell |
||||
$ROCM_INSTALL_DIR/bin/hipcc -o hip_assembly_to_executable main.o main_device.o |
||||
``` |
||||
|
||||
### Visual Studio 2019 |
||||
The above compilation steps are implemented in Visual Studio through Custom Build Steps and Custom Build Tools: |
||||
- The host compilation from step 1 is performed by adding extra options to the source file, under `main.hip -> properties -> C/C++ -> Command Line`: |
||||
``` |
||||
Additional Options: --cuda-host-only |
||||
``` |
||||
- Each device assembly .s file has a custom build tool associated to it, which performs the operation associated to step 2 from the previous section: |
||||
``` |
||||
Command Line: "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a |
||||
Description: Compiling Device Assembly %(Identity) |
||||
Output: $(IntDir)%(FileName).o |
||||
Execute Before: ClCompile |
||||
``` |
||||
- Steps 3 and 4 are implemented using a custom build step: |
||||
``` |
||||
Command Line: |
||||
"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" |
||||
cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj |
||||
Description: Generating Device Offload Object |
||||
Outputs: $(IntDir)main_device.obj |
||||
Additional Dependencies: $(IntDir)main_gfx90a.o;$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs) |
||||
Execute Before: ClCompile |
||||
``` |
||||
- Finally step 5 is implemented by passing additional inputs to the linker in `project -> properties -> Linker -> Input`: |
||||
``` |
||||
Additional Dependencies: $(IntDir)main_device.obj;%(AdditionalDependencies) |
||||
``` |
||||
|
||||
## Used API surface |
||||
### HIP runtime |
||||
- `hipFree` |
||||
- `hipGetDeviceProperties` |
||||
- `hipGetLastError` |
||||
- `hipLaunchKernelGGL` |
||||
- `hipMalloc` |
||||
- `hipMemcpy` |
@ -0,0 +1,183 @@
@@ -0,0 +1,183 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip"> |
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions> |
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions> |
||||
</ClCompile> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<CustomBuild Include="hip_obj_gen_win.mcin"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command> |
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message> |
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command> |
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message> |
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx1030.s"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx803.s"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx900.s">
  <FileType>Document</FileType>
  <!-- Assemble the gfx900 device assembly into a device object file in $(IntDir).
       The compiler path is quoted as a whole ("...clang++") like the other architectures. -->
  <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
  <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
</CustomBuild>
||||
<CustomBuild Include="main_gfx906.s"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx908.s"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx90a.s"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command> |
||||
</CustomBuild> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{60b4ade0-8286-46ae-b884-5da51b541ded}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>assembly_to_executable_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
<CustomBuild> |
||||
<Message>Compiling Device Assembly %(Identity)</Message> |
||||
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command> |
||||
<Outputs>$(IntDir)%(FileName).o</Outputs> |
||||
</CustomBuild> |
||||
<CustomBuildStep>
  <!-- Step 1: bundle the per-architecture device objects into offload_bundle.hipfb.
       The order of the -targets entries must match the order of the -input entries;
       the first (host) entry is intentionally empty (-input=nul).
       Step 2: assemble hip_obj_gen_win.mcin, which embeds the bundle, into main_device.obj. -->
  <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
  <Message>Generating Device Offload Object</Message>
  <Outputs>$(IntDir)main_device.obj</Outputs>
  <!-- The step depends on every device object and on the copy of hip_obj_gen_win.mcin
       that the CustomBuild item of the same name places in $(IntDir). -->
  <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
<CustomBuild> |
||||
<Message>Compiling Device Assembly %(Identity)</Message> |
||||
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command> |
||||
<Outputs>$(IntDir)%(FileName).o</Outputs> |
||||
</CustomBuild> |
||||
<CustomBuildStep>
  <!-- Step 1: bundle the per-architecture device objects into offload_bundle.hipfb.
       The order of the -targets entries must match the order of the -input entries;
       the first (host) entry is intentionally empty (-input=nul). Every device target
       uses the form hipv4-amdgcn-amd-amdhsa--<arch> (note the double dash).
       Step 2: assemble hip_obj_gen_win.mcin, which embeds the bundle, into main_device.obj. -->
  <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
  <Message>Generating Device Offload Object</Message>
  <Outputs>$(IntDir)main_device.obj</Outputs>
  <!-- The step depends on every device object and on the copy of hip_obj_gen_win.mcin
       that the CustomBuild item of the same name places in $(IntDir). -->
  <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
@ -0,0 +1,21 @@
@@ -0,0 +1,21 @@
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||||
# See https://llvm.org/LICENSE.txt for license information. |
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||||
|
||||
# HIP Object Generator |
||||
# Use this generator to create a host bundled object file |
||||
# with the input of an offload bundled fat binary. |
||||
# |
||||
# Input: Bundled Object file .hipfb file |
||||
# Output: Host Bundled Object File .o |
||||
|
||||
.type __hip_fatbin,@object |
||||
# Tell the assembler to place the offload bundle in the appropriate section. |
||||
.section .hip_fatbin,"a",@progbits |
||||
# Make the symbol that addresses the binary public. |
||||
.globl __hip_fatbin |
||||
# Give the bundle the required alignment of 4096 (2 ^ 12). |
||||
.p2align 12 |
||||
__hip_fatbin: |
||||
# Include the offload bundle. |
||||
.incbin "offload_bundle.hipfb" |
@ -0,0 +1,20 @@
@@ -0,0 +1,20 @@
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||||
# See https://llvm.org/LICENSE.txt for license information. |
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||||
|
||||
# HIP Object Generator |
||||
# Use this generator to create a host bundled object file |
||||
# with the input of an offload bundled fat binary. |
||||
# |
||||
# Input: Bundled Object file .hipfb file |
||||
# Output: Host Bundled Object File .o |
||||
|
||||
# Tell the assembler to place the offload bundle in the appropriate section. |
||||
.section .hip_fatbin,"dw" |
||||
# Make the symbol that addresses the binary public. |
||||
.globl __hip_fatbin |
||||
# Give the bundle the required alignment of 4096 (2 ^ 12). |
||||
.p2align 12 |
||||
__hip_fatbin: |
||||
# Include the offload bundle. |
||||
.incbin "offload_bundle.hipfb" |
@ -0,0 +1,118 @@
@@ -0,0 +1,118 @@
|
||||
// MIT License |
||||
// |
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
// |
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
// of this software and associated documentation files (the "Software"), to deal |
||||
// in the Software without restriction, including without limitation the rights |
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
// copies of the Software, and to permit persons to whom the Software is |
||||
// furnished to do so, subject to the following conditions: |
||||
// |
||||
// The above copyright notice and this permission notice shall be included in all |
||||
// copies or substantial portions of the Software. |
||||
// |
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
// SOFTWARE. |
||||
|
||||
#include "example_utils.hpp" |
||||
|
||||
#include <hip/hip_runtime.h> |
||||
|
||||
#include <cstdlib> |
||||
#include <iostream> |
||||
#include <vector> |
||||
|
||||
/// \brief Device function to square each element |
||||
/// in the array `in` and write to array `out`. |
||||
template<typename T> |
||||
__global__ void vector_square_kernel(T* out, const T* in, const long long size) |
||||
{ |
||||
// Get the unique global thread ID |
||||
const size_t offset = blockIdx.x * blockDim.x + threadIdx.x; |
||||
// Each thread hops stride amount of elements to find the next |
||||
// element to square |
||||
const size_t stride = blockDim.x * gridDim.x; |
||||
|
||||
for(size_t i = offset; i < size; i += stride) |
||||
{ |
||||
out[i] = in[i] * in[i]; |
||||
} |
||||
} |
||||
|
||||
int main() |
||||
{ |
||||
// Set the problem size |
||||
constexpr size_t size = 1000000; |
||||
constexpr size_t size_in_bytes = size * sizeof(float); |
||||
|
||||
hipDeviceProp_t props; |
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0 /*deviceID*/)); |
||||
std::cout << "info: running on device " << props.name << "\n"; |
||||
|
||||
std::cout << "info: allocate host mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB) " |
||||
<< "\n"; |
||||
|
||||
// Declare the host side arrays |
||||
std::vector<float> h_in(size); |
||||
std::vector<float> h_out(size); |
||||
|
||||
// Initialize the host size input |
||||
for(size_t i = 0; i < size; i++) |
||||
{ |
||||
h_in[i] = 1.618f + i; |
||||
} |
||||
|
||||
// Declare the device side arrays |
||||
float *d_in, *d_out; |
||||
std::cout << "info: allocate device mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB)\n"; |
||||
// Allocate the device side memory |
||||
HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); |
||||
HIP_CHECK(hipMalloc(&d_out, size_in_bytes)); |
||||
|
||||
std::cout << "info: copy Host2Device\n"; |
||||
|
||||
// Copy the input from host to the GPU device |
||||
HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Set the number of blocks per kernel grid. |
||||
constexpr unsigned int grid_size = 512; |
||||
// Set the number of threads per kernel block. |
||||
constexpr unsigned int threads_per_block = 256; |
||||
|
||||
std::cout << "info: launch 'vector_square_kernel' kernel\n"; |
||||
hipLaunchKernelGGL(vector_square_kernel, |
||||
grid_size, |
||||
threads_per_block, |
||||
0, |
||||
hipStreamDefault, |
||||
d_out, |
||||
d_in, |
||||
size); |
||||
|
||||
// Check that the kernel invocation was successful. |
||||
HIP_CHECK(hipGetLastError()); |
||||
|
||||
std::cout << "info: copy Device2Host\n"; |
||||
HIP_CHECK(hipMemcpy(h_out.data(), d_out, size_in_bytes, hipMemcpyDeviceToHost)); |
||||
|
||||
HIP_CHECK(hipFree(d_in)); |
||||
HIP_CHECK(hipFree(d_out)); |
||||
|
||||
std::cout << "info: check result\n"; |
||||
for(size_t i = 0; i < size; i++) |
||||
{ |
||||
if(h_out[i] != h_in[i] * h_in[i]) |
||||
{ |
||||
std::cerr << "FAILED! h_out[" << i << "] = " << h_out[i] |
||||
<< ", expected: " << h_in[i] * h_in[i] << '\n'; |
||||
exit(error_exit_code); |
||||
} |
||||
} |
||||
std::cout << "PASSED!\n"; |
||||
} |
@ -0,0 +1,219 @@
@@ -0,0 +1,219 @@
|
||||
.text |
||||
.amdgcn_target "amdgcn-amd-amdhsa--gfx1030" |
||||
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.p2align 8
|
||||
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
|
||||
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
; %bb.0:
|
||||
s_load_dword s0, s[4:5], 0x4 |
||||
s_load_dwordx2 s[2:3], s[6:7], 0x10 |
||||
v_mov_b32_e32 v1, 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
s_and_b32 s0, s0, 0xffff |
||||
s_mul_i32 s8, s8, s0 |
||||
v_add_nc_u32_e32 v0, s8, v0 |
||||
v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] |
||||
s_and_saveexec_b32 s0, vcc_lo |
||||
s_cbranch_execz BB0_3 |
||||
; %bb.1:
|
||||
s_load_dword s8, s[4:5], 0xc |
||||
s_load_dwordx4 s[4:7], s[6:7], 0x0 |
||||
v_lshlrev_b64 v[2:3], 2, v[0:1] |
||||
s_mov_b32 s9, 0 |
||||
s_mov_b32 s1, s9 |
||||
s_waitcnt lgkmcnt(0) |
||||
s_lshl_b64 s[10:11], s[8:9], 2 |
||||
.p2align 6
|
||||
BB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
v_add_co_u32 v4, vcc_lo, s6, v2 |
||||
v_add_co_ci_u32_e32 v5, vcc_lo, s7, v3, vcc_lo |
||||
v_add_co_u32 v0, vcc_lo, v0, s8 |
||||
v_add_co_ci_u32_e32 v1, vcc_lo, s9, v1, vcc_lo |
||||
global_load_dword v6, v[4:5], off |
||||
v_add_co_u32 v4, vcc_lo, s4, v2 |
||||
v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo |
||||
v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1] |
||||
v_add_co_u32 v2, s0, v2, s10 |
||||
v_add_co_ci_u32_e64 v3, s0, s11, v3, s0 |
||||
s_or_b32 s1, vcc_lo, s1 |
||||
s_waitcnt vmcnt(0) |
||||
v_mul_f32_e32 v6, v6, v6 |
||||
global_store_dword v[4:5], v6, off |
||||
s_andn2_b32 exec_lo, exec_lo, s1 |
||||
s_cbranch_execnz BB0_2 |
||||
BB0_3: |
||||
s_endpgm |
||||
.section .rodata,#alloc |
||||
.p2align 6
|
||||
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_kernarg_size 80
|
||||
.amdhsa_user_sgpr_private_segment_buffer 1
|
||||
.amdhsa_user_sgpr_dispatch_ptr 1
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_wavefront_size32 1
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 1
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_next_free_vgpr 7
|
||||
.amdhsa_next_free_sgpr 12
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 3
|
||||
.amdhsa_float_denorm_mode_16_64 3
|
||||
.amdhsa_dx10_clamp 1
|
||||
.amdhsa_ieee_mode 1
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_workgroup_processor_mode 1
|
||||
.amdhsa_memory_ordered 1
|
||||
.amdhsa_forward_progress 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.end_amdhsa_kernel |
||||
.text |
||||
.Lfunc_end0: |
||||
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x |
||||
; -- End function
|
||||
.section .AMDGPU.csdata |
||||
; Kernel info:
|
||||
; codeLenInByte = 188
|
||||
; NumSgprs: 14
|
||||
; NumVgprs: 7
|
||||
; ScratchSize: 0
|
||||
; MemoryBound: 0
|
||||
; FloatMode: 240
|
||||
; IeeeMode: 1
|
||||
; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
; SGPRBlocks: 1
|
||||
; VGPRBlocks: 0
|
||||
; NumSGPRsForWavesPerEU: 14
|
||||
; NumVGPRsForWavesPerEU: 7
|
||||
; Occupancy: 16
|
||||
; WaveLimiterHint : 1
|
||||
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
|
||||
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
||||
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
||||
.text |
||||
.p2alignl 6, 3214868480 |
||||
.fill 48, 4, 3214868480 |
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 |
||||
|
||||
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" |
||||
.section ".note.GNU-stack" |
||||
.addrsig |
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.amdgpu_metadata |
||||
--- |
||||
amdhsa.kernels: |
||||
- .args: |
||||
- .address_space: global |
||||
.offset: 0 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .address_space: global |
||||
.offset: 8 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .offset: 16 |
||||
.size: 8 |
||||
.value_kind: by_value |
||||
- .offset: 24 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_x |
||||
- .offset: 32 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_y |
||||
- .offset: 40 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_z |
||||
- .address_space: global |
||||
.offset: 48 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 56 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 64 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 72 |
||||
.size: 8 |
||||
.value_kind: hidden_multigrid_sync_arg |
||||
.group_segment_fixed_size: 0 |
||||
.kernarg_segment_align: 8 |
||||
.kernarg_segment_size: 80 |
||||
.language: OpenCL C |
||||
.language_version: |
||||
- 2 |
||||
- 0 |
||||
.max_flat_workgroup_size: 1024 |
||||
.name: _Z20vector_square_kernelIfEvPT_PKS0_x |
||||
.private_segment_fixed_size: 0 |
||||
.sgpr_count: 14 |
||||
.sgpr_spill_count: 0 |
||||
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd |
||||
.vgpr_count: 7 |
||||
.vgpr_spill_count: 0 |
||||
.wavefront_size: 32 |
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx1030 |
||||
amdhsa.version: |
||||
- 1 |
||||
- 1 |
||||
... |
||||
|
||||
.end_amdgpu_metadata |
@ -0,0 +1,214 @@
@@ -0,0 +1,214 @@
|
||||
.text |
||||
.amdgcn_target "amdgcn-amd-amdhsa--gfx803" |
||||
; Kernel _Z20vector_square_kernelIfEvPT_PKS0_x, gfx803 build. The mangled name
; demangles to void vector_square_kernel<float>(float*, const float*, long long).
; Each active lane repeatedly loads one 32-bit float from s[6:7] + 4*index,
; multiplies it by itself, and stores it to s[4:5] + 4*index, then advances its
; 64-bit index by the stride in s8 until the index reaches the bound in s[10:11].
; NOTE(review): entry register roles are assumed from the SGPRs enabled in the
; kernel descriptor that follows this function: s[4:5] = dispatch packet
; pointer, s[6:7] = kernarg pointer, s8 = workgroup id X, v0 = work-item id X.
; Confirm against the AMDGPU code object ABI.
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.p2align 8
|
||||
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
|
||||
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
; %bb.0:
|
||||
; s0 <- dword at s[4:5]+0x4; only its low 16 bits are used below
; (presumably workgroup_size_x of the dispatch packet -- TODO confirm).
s_load_dword s0, s[4:5], 0x4 |
||||
; s[10:11] <- 64-bit loop bound at s[6:7]+0x10 (presumably the by_value
; argument the metadata places at kernarg offset 16).
s_load_dwordx2 s[10:11], s[6:7], 0x10 |
||||
; v1 = 0: high half of the 64-bit index kept in v[0:1].
v_mov_b32_e32 v1, 0 |
||||
; Wait for the scalar loads above before reading s0 and s[10:11].
s_waitcnt lgkmcnt(0) |
||||
; v0 = s8 * (s0 & 0xffff) + v0: this lane's starting index.
s_and_b32 s0, s0, 0xffff |
||||
s_mul_i32 s8, s8, s0 |
||||
v_add_u32_e32 v0, vcc, s8, v0 |
||||
; Keep only lanes whose index is below the bound (unsigned 64-bit compare).
; The previous exec mask is saved in s[0:1]; branch to the end if no lane is left.
v_cmp_gt_u64_e32 vcc, s[10:11], v[0:1] |
||||
s_and_saveexec_b64 s[0:1], vcc |
||||
s_cbranch_execz BB0_3 |
||||
; %bb.1:
|
||||
; s8 <- dword at s[4:5]+0xc, used as the per-iteration index stride
; (presumably grid_size_x of the dispatch packet -- TODO confirm).
s_load_dword s8, s[4:5], 0xc |
||||
; s[4:7] <- 16 bytes at s[6:7]+0x0 (presumably the two pointer arguments).
; From here on s[4:5] is the store base and s[6:7] is the load base.
s_load_dwordx4 s[4:7], s[6:7], 0x0 |
||||
; s9 = 0 so that s[8:9] is the stride zero-extended to 64 bits.
s_mov_b32 s9, 0 |
||||
; v[2:3] = index * 4: byte offset of this lane's element.
v_lshlrev_b64 v[2:3], 2, v[0:1] |
||||
; s[14:15] = 0: accumulates the mask of lanes that have finished.
s_mov_b64 s[14:15], 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
; s[12:13] = stride * 4: byte-offset step per iteration.
s_lshl_b64 s[12:13], s[8:9], 2 |
||||
BB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
; v[4:5] = s[6:7] + v[2:3]: address of the element to load.
v_mov_b32_e32 v5, s7 |
||||
v_add_u32_e32 v4, vcc, s6, v2 |
||||
v_addc_u32_e32 v5, vcc, v5, v3, vcc |
||||
flat_load_dword v6, v[4:5] |
||||
; Three interleaved 64-bit additions, each with its own carry register:
;   v[4:5] = s[4:5] + v[2:3]   store address     (carry in s[0:1])
;   v[0:1] += s[8:9]           next index        (carry in vcc)
;   v[2:3] += s[12:13]         next byte offset  (carry in s[2:3])
; v[4:5] is formed from the old v[2:3] before v2/v3 are updated.
; s[0:1] is reused as a carry register here, so the exec mask saved at entry
; is overwritten; nothing in this function restores exec from it.
v_mov_b32_e32 v5, s5 |
||||
v_mov_b32_e32 v7, s9 |
||||
v_add_u32_e32 v0, vcc, s8, v0 |
||||
v_mov_b32_e32 v8, s13 |
||||
v_add_u32_e64 v4, s[0:1], s4, v2 |
||||
v_add_u32_e64 v2, s[2:3], s12, v2 |
||||
v_addc_u32_e64 v5, s[0:1], v5, v3, s[0:1] |
||||
v_addc_u32_e32 v1, vcc, v1, v7, vcc |
||||
v_addc_u32_e64 v3, vcc, v3, v8, s[2:3] |
||||
; Lanes whose new index has reached the bound are added to the finished mask.
v_cmp_le_u64_e32 vcc, s[10:11], v[0:1] |
||||
s_or_b64 s[14:15], vcc, s[14:15] |
||||
; Wait for the load into v6 to return, then square the value and store it.
s_waitcnt vmcnt(0) |
||||
v_mul_f32_e32 v6, v6, v6 |
||||
flat_store_dword v[4:5], v6 |
||||
; Disable the finished lanes and loop while any lane is still active.
s_andn2_b64 exec, exec, s[14:15] |
||||
s_cbranch_execnz BB0_2 |
||||
BB0_3: |
||||
s_endpgm |
||||
.section .rodata,#alloc |
||||
.p2align 6
|
||||
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_kernarg_size 80
|
||||
.amdhsa_user_sgpr_private_segment_buffer 1
|
||||
.amdhsa_user_sgpr_dispatch_ptr 1
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 1
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_next_free_vgpr 9
|
||||
.amdhsa_next_free_sgpr 16
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 0
|
||||
.amdhsa_float_denorm_mode_16_64 3
|
||||
.amdhsa_dx10_clamp 1
|
||||
.amdhsa_ieee_mode 1
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.end_amdhsa_kernel |
||||
.text |
||||
.Lfunc_end0: |
||||
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x |
||||
; -- End function
|
||||
.section .AMDGPU.csdata |
||||
; Kernel info:
|
||||
; codeLenInByte = 200
|
||||
; NumSgprs: 18
|
||||
; NumVgprs: 9
|
||||
; ScratchSize: 0
|
||||
; MemoryBound: 0
|
||||
; FloatMode: 192
|
||||
; IeeeMode: 1
|
||||
; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
; SGPRBlocks: 2
|
||||
; VGPRBlocks: 2
|
||||
; NumSGPRsForWavesPerEU: 18
|
||||
; NumVGPRsForWavesPerEU: 9
|
||||
; Occupancy: 10
|
||||
; WaveLimiterHint : 1
|
||||
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
|
||||
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
||||
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 |
||||
|
||||
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" |
||||
.section ".note.GNU-stack" |
||||
.addrsig |
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.amdgpu_metadata |
||||
--- |
||||
amdhsa.kernels: |
||||
- .args: |
||||
- .address_space: global |
||||
.offset: 0 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .address_space: global |
||||
.offset: 8 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .offset: 16 |
||||
.size: 8 |
||||
.value_kind: by_value |
||||
- .offset: 24 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_x |
||||
- .offset: 32 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_y |
||||
- .offset: 40 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_z |
||||
- .address_space: global |
||||
.offset: 48 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 56 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 64 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 72 |
||||
.size: 8 |
||||
.value_kind: hidden_multigrid_sync_arg |
||||
.group_segment_fixed_size: 0 |
||||
.kernarg_segment_align: 8 |
||||
.kernarg_segment_size: 80 |
||||
.language: OpenCL C |
||||
.language_version: |
||||
- 2 |
||||
- 0 |
||||
.max_flat_workgroup_size: 1024 |
||||
.name: _Z20vector_square_kernelIfEvPT_PKS0_x |
||||
.private_segment_fixed_size: 0 |
||||
.sgpr_count: 18 |
||||
.sgpr_spill_count: 0 |
||||
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd |
||||
.vgpr_count: 9 |
||||
.vgpr_spill_count: 0 |
||||
.wavefront_size: 64 |
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx803 |
||||
amdhsa.version: |
||||
- 1 |
||||
- 1 |
||||
... |
||||
|
||||
.end_amdgpu_metadata |
@ -0,0 +1,216 @@
@@ -0,0 +1,216 @@
|
||||
.text |
||||
.amdgcn_target "amdgcn-amd-amdhsa--gfx900" |
||||
; Kernel _Z20vector_square_kernelIfEvPT_PKS0_x, gfx900 build. The mangled name
; demangles to void vector_square_kernel<float>(float*, const float*, long long).
; Each active lane repeatedly loads one 32-bit float from s[10:11] + 4*index,
; multiplies it by itself, and stores it to s[8:9] + 4*index, then advances its
; 64-bit index by the stride in s14 until the index reaches the bound in s[12:13].
; NOTE(review): entry register roles are assumed from the SGPRs enabled in the
; kernel descriptor that follows this function: s[4:5] = dispatch packet
; pointer, s[6:7] = kernarg pointer, s8 = workgroup id X, v0 = work-item id X.
; Confirm against the AMDGPU code object ABI.
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.p2align 8
|
||||
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
|
||||
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
; %bb.0:
|
||||
; s0 <- dword at s[4:5]+0x4; only its low 16 bits are used below
; (presumably workgroup_size_x of the dispatch packet -- TODO confirm).
s_load_dword s0, s[4:5], 0x4 |
||||
; s[12:13] <- 64-bit loop bound at s[6:7]+0x10 (presumably the by_value
; argument the metadata places at kernarg offset 16).
s_load_dwordx2 s[12:13], s[6:7], 0x10 |
||||
; v1 = 0: high half of the 64-bit index kept in v[0:1].
v_mov_b32_e32 v1, 0 |
||||
; Wait for the scalar loads above before reading s0 and s[12:13].
s_waitcnt lgkmcnt(0) |
||||
; v0 = s8 * (s0 & 0xffff) + v0: this lane's starting index.
s_and_b32 s0, s0, 0xffff |
||||
s_mul_i32 s8, s8, s0 |
||||
v_add_u32_e32 v0, s8, v0 |
||||
; Keep only lanes whose index is below the bound (unsigned 64-bit compare).
; The previous exec mask is saved in s[0:1]; branch to the end if no lane is left.
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_and_saveexec_b64 s[0:1], vcc |
||||
s_cbranch_execz BB0_3 |
||||
; %bb.1:
|
||||
; s14 <- dword at s[4:5]+0xc, used as the per-iteration index stride
; (presumably grid_size_x of the dispatch packet -- TODO confirm).
s_load_dword s14, s[4:5], 0xc |
||||
; s[8:11] <- 16 bytes at s[6:7]+0x0 (presumably the two pointer arguments).
; In the loop s[8:9] is the store base and s[10:11] is the load base.
s_load_dwordx4 s[8:11], s[6:7], 0x0 |
||||
; s15 = 0 so that s[14:15] is the stride zero-extended to 64 bits.
s_mov_b32 s15, 0 |
||||
; v[2:3] = index * 4: byte offset of this lane's element.
v_lshlrev_b64 v[2:3], 2, v[0:1] |
||||
; s[6:7] is reused from here on as the mask of lanes that have finished.
s_mov_b64 s[6:7], 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
; s[4:5] = stride * 4: byte-offset step per iteration.
s_lshl_b64 s[4:5], s[14:15], 2 |
||||
BB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
; v[4:5] = s[10:11] + v[2:3]: address of the element to load.
v_mov_b32_e32 v5, s11 |
||||
v_add_co_u32_e32 v4, vcc, s10, v2 |
||||
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc |
||||
global_load_dword v6, v[4:5], off |
||||
; Three interleaved 64-bit additions, each with its own carry register:
;   v[4:5] = s[8:9] + v[2:3]   store address     (carry in s[0:1])
;   v[0:1] += s[14:15]         next index        (carry in vcc)
;   v[2:3] += s[4:5]           next byte offset  (carry in s[2:3])
; v[4:5] is formed from the old v[2:3] before v2/v3 are updated.
; s[0:1] is reused as a carry register here, so the exec mask saved at entry
; is overwritten; nothing in this function restores exec from it.
v_mov_b32_e32 v5, s9 |
||||
v_mov_b32_e32 v7, s15 |
||||
v_add_co_u32_e32 v0, vcc, s14, v0 |
||||
v_mov_b32_e32 v8, s5 |
||||
v_add_co_u32_e64 v4, s[0:1], s8, v2 |
||||
v_add_co_u32_e64 v2, s[2:3], s4, v2 |
||||
v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] |
||||
v_addc_co_u32_e32 v1, vcc, v1, v7, vcc |
||||
v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] |
||||
; Lanes whose new index has reached the bound are added to the finished mask.
v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_or_b64 s[6:7], vcc, s[6:7] |
||||
; Wait for the load into v6 to return, then square the value and store it.
s_waitcnt vmcnt(0) |
||||
v_mul_f32_e32 v6, v6, v6 |
||||
global_store_dword v[4:5], v6, off |
||||
; Disable the finished lanes and loop while any lane is still active.
s_andn2_b64 exec, exec, s[6:7] |
||||
s_cbranch_execnz BB0_2 |
||||
BB0_3: |
||||
s_endpgm |
||||
.section .rodata,#alloc |
||||
.p2align 6
|
||||
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_kernarg_size 80
|
||||
.amdhsa_user_sgpr_private_segment_buffer 1
|
||||
.amdhsa_user_sgpr_dispatch_ptr 1
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 1
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_next_free_vgpr 9
|
||||
.amdhsa_next_free_sgpr 16
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 1
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 3
|
||||
.amdhsa_float_denorm_mode_16_64 3
|
||||
.amdhsa_dx10_clamp 1
|
||||
.amdhsa_ieee_mode 1
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.end_amdhsa_kernel |
||||
.text |
||||
.Lfunc_end0: |
||||
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x |
||||
; -- End function
|
||||
.section .AMDGPU.csdata |
||||
; Kernel info:
|
||||
; codeLenInByte = 200
|
||||
; NumSgprs: 18
|
||||
; NumVgprs: 9
|
||||
; ScratchSize: 0
|
||||
; MemoryBound: 0
|
||||
; FloatMode: 240
|
||||
; IeeeMode: 1
|
||||
; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
; SGPRBlocks: 2
|
||||
; VGPRBlocks: 2
|
||||
; NumSGPRsForWavesPerEU: 18
|
||||
; NumVGPRsForWavesPerEU: 9
|
||||
; Occupancy: 10
|
||||
; WaveLimiterHint : 1
|
||||
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
|
||||
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
||||
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 |
||||
|
||||
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" |
||||
.section ".note.GNU-stack" |
||||
.addrsig |
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.amdgpu_metadata |
||||
--- |
||||
amdhsa.kernels: |
||||
- .args: |
||||
- .address_space: global |
||||
.offset: 0 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .address_space: global |
||||
.offset: 8 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .offset: 16 |
||||
.size: 8 |
||||
.value_kind: by_value |
||||
- .offset: 24 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_x |
||||
- .offset: 32 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_y |
||||
- .offset: 40 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_z |
||||
- .address_space: global |
||||
.offset: 48 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 56 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 64 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 72 |
||||
.size: 8 |
||||
.value_kind: hidden_multigrid_sync_arg |
||||
.group_segment_fixed_size: 0 |
||||
.kernarg_segment_align: 8 |
||||
.kernarg_segment_size: 80 |
||||
.language: OpenCL C |
||||
.language_version: |
||||
- 2 |
||||
- 0 |
||||
.max_flat_workgroup_size: 1024 |
||||
.name: _Z20vector_square_kernelIfEvPT_PKS0_x |
||||
.private_segment_fixed_size: 0 |
||||
.sgpr_count: 18 |
||||
.sgpr_spill_count: 0 |
||||
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd |
||||
.vgpr_count: 9 |
||||
.vgpr_spill_count: 0 |
||||
.wavefront_size: 64 |
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx900 |
||||
amdhsa.version: |
||||
- 1 |
||||
- 1 |
||||
... |
||||
|
||||
.end_amdgpu_metadata |
@ -0,0 +1,216 @@
@@ -0,0 +1,216 @@
|
||||
.text |
||||
.amdgcn_target "amdgcn-amd-amdhsa--gfx906" |
||||
; Kernel _Z20vector_square_kernelIfEvPT_PKS0_x, gfx906 build. The mangled name
; demangles to void vector_square_kernel<float>(float*, const float*, long long).
; Each active lane repeatedly loads one 32-bit float from s[10:11] + 4*index,
; multiplies it by itself, and stores it to s[8:9] + 4*index, then advances its
; 64-bit index by the stride in s14 until the index reaches the bound in s[12:13].
; NOTE(review): entry register roles are assumed from the SGPRs enabled in the
; kernel descriptor that follows this function: s[4:5] = dispatch packet
; pointer, s[6:7] = kernarg pointer, s8 = workgroup id X, v0 = work-item id X.
; Confirm against the AMDGPU code object ABI.
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.p2align 8
|
||||
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
|
||||
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
; %bb.0:
|
||||
; s0 <- dword at s[4:5]+0x4; only its low 16 bits are used below
; (presumably workgroup_size_x of the dispatch packet -- TODO confirm).
s_load_dword s0, s[4:5], 0x4 |
||||
; s[12:13] <- 64-bit loop bound at s[6:7]+0x10 (presumably the by_value
; argument the metadata places at kernarg offset 16).
s_load_dwordx2 s[12:13], s[6:7], 0x10 |
||||
; v1 = 0: high half of the 64-bit index kept in v[0:1].
v_mov_b32_e32 v1, 0 |
||||
; Wait for the scalar loads above before reading s0 and s[12:13].
s_waitcnt lgkmcnt(0) |
||||
; v0 = s8 * (s0 & 0xffff) + v0: this lane's starting index.
s_and_b32 s0, s0, 0xffff |
||||
s_mul_i32 s8, s8, s0 |
||||
v_add_u32_e32 v0, s8, v0 |
||||
; Keep only lanes whose index is below the bound (unsigned 64-bit compare).
; The previous exec mask is saved in s[0:1]; branch to the end if no lane is left.
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_and_saveexec_b64 s[0:1], vcc |
||||
s_cbranch_execz BB0_3 |
||||
; %bb.1:
|
||||
; s14 <- dword at s[4:5]+0xc, used as the per-iteration index stride
; (presumably grid_size_x of the dispatch packet -- TODO confirm).
s_load_dword s14, s[4:5], 0xc |
||||
; s[8:11] <- 16 bytes at s[6:7]+0x0 (presumably the two pointer arguments).
; In the loop s[8:9] is the store base and s[10:11] is the load base.
s_load_dwordx4 s[8:11], s[6:7], 0x0 |
||||
; s15 = 0 so that s[14:15] is the stride zero-extended to 64 bits.
s_mov_b32 s15, 0 |
||||
; v[2:3] = index * 4: byte offset of this lane's element.
v_lshlrev_b64 v[2:3], 2, v[0:1] |
||||
; s[6:7] is reused from here on as the mask of lanes that have finished.
s_mov_b64 s[6:7], 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
; s[4:5] = stride * 4: byte-offset step per iteration.
s_lshl_b64 s[4:5], s[14:15], 2 |
||||
BB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
; v[4:5] = s[10:11] + v[2:3]: address of the element to load.
v_mov_b32_e32 v5, s11 |
||||
v_add_co_u32_e32 v4, vcc, s10, v2 |
||||
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc |
||||
global_load_dword v6, v[4:5], off |
||||
; Three interleaved 64-bit additions, each with its own carry register:
;   v[4:5] = s[8:9] + v[2:3]   store address     (carry in s[0:1])
;   v[0:1] += s[14:15]         next index        (carry in vcc)
;   v[2:3] += s[4:5]           next byte offset  (carry in s[2:3])
; v[4:5] is formed from the old v[2:3] before v2/v3 are updated.
; s[0:1] is reused as a carry register here, so the exec mask saved at entry
; is overwritten; nothing in this function restores exec from it.
v_mov_b32_e32 v5, s9 |
||||
v_mov_b32_e32 v7, s15 |
||||
v_add_co_u32_e32 v0, vcc, s14, v0 |
||||
v_mov_b32_e32 v8, s5 |
||||
v_add_co_u32_e64 v4, s[0:1], s8, v2 |
||||
v_add_co_u32_e64 v2, s[2:3], s4, v2 |
||||
v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] |
||||
v_addc_co_u32_e32 v1, vcc, v1, v7, vcc |
||||
v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] |
||||
; Lanes whose new index has reached the bound are added to the finished mask.
v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_or_b64 s[6:7], vcc, s[6:7] |
||||
; Wait for the load into v6 to return, then square the value and store it.
s_waitcnt vmcnt(0) |
||||
v_mul_f32_e32 v6, v6, v6 |
||||
global_store_dword v[4:5], v6, off |
||||
; Disable the finished lanes and loop while any lane is still active.
s_andn2_b64 exec, exec, s[6:7] |
||||
s_cbranch_execnz BB0_2 |
||||
BB0_3: |
||||
s_endpgm |
||||
.section .rodata,#alloc |
||||
.p2align 6
|
||||
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_kernarg_size 80
|
||||
.amdhsa_user_sgpr_private_segment_buffer 1
|
||||
.amdhsa_user_sgpr_dispatch_ptr 1
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 1
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_next_free_vgpr 9
|
||||
.amdhsa_next_free_sgpr 16
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 1
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 3
|
||||
.amdhsa_float_denorm_mode_16_64 3
|
||||
.amdhsa_dx10_clamp 1
|
||||
.amdhsa_ieee_mode 1
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.end_amdhsa_kernel |
||||
.text |
||||
.Lfunc_end0: |
||||
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x |
||||
; -- End function
|
||||
.section .AMDGPU.csdata |
||||
; Kernel info:
|
||||
; codeLenInByte = 200
|
||||
; NumSgprs: 18
|
||||
; NumVgprs: 9
|
||||
; ScratchSize: 0
|
||||
; MemoryBound: 0
|
||||
; FloatMode: 240
|
||||
; IeeeMode: 1
|
||||
; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
; SGPRBlocks: 2
|
||||
; VGPRBlocks: 2
|
||||
; NumSGPRsForWavesPerEU: 18
|
||||
; NumVGPRsForWavesPerEU: 9
|
||||
; Occupancy: 10
|
||||
; WaveLimiterHint : 1
|
||||
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
|
||||
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
||||
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 |
||||
|
||||
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" |
||||
.section ".note.GNU-stack" |
||||
.addrsig |
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.amdgpu_metadata |
||||
--- |
||||
amdhsa.kernels: |
||||
- .args: |
||||
- .address_space: global |
||||
.offset: 0 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .address_space: global |
||||
.offset: 8 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .offset: 16 |
||||
.size: 8 |
||||
.value_kind: by_value |
||||
- .offset: 24 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_x |
||||
- .offset: 32 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_y |
||||
- .offset: 40 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_z |
||||
- .address_space: global |
||||
.offset: 48 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 56 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 64 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 72 |
||||
.size: 8 |
||||
.value_kind: hidden_multigrid_sync_arg |
||||
.group_segment_fixed_size: 0 |
||||
.kernarg_segment_align: 8 |
||||
.kernarg_segment_size: 80 |
||||
.language: OpenCL C |
||||
.language_version: |
||||
- 2 |
||||
- 0 |
||||
.max_flat_workgroup_size: 1024 |
||||
.name: _Z20vector_square_kernelIfEvPT_PKS0_x |
||||
.private_segment_fixed_size: 0 |
||||
.sgpr_count: 18 |
||||
.sgpr_spill_count: 0 |
||||
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd |
||||
.vgpr_count: 9 |
||||
.vgpr_spill_count: 0 |
||||
.wavefront_size: 64 |
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx906 |
||||
amdhsa.version: |
||||
- 1 |
||||
- 1 |
||||
... |
||||
|
||||
.end_amdgpu_metadata |
@ -0,0 +1,218 @@
@@ -0,0 +1,218 @@
|
||||
.text |
||||
.amdgcn_target "amdgcn-amd-amdhsa--gfx908" |
||||
; Kernel _Z20vector_square_kernelIfEvPT_PKS0_x, gfx908 build. The mangled name
; demangles to void vector_square_kernel<float>(float*, const float*, long long).
; Each active lane repeatedly loads one 32-bit float from s[10:11] + 4*index,
; multiplies it by itself, and stores it to s[8:9] + 4*index, then advances its
; 64-bit index by the stride in s14 until the index reaches the bound in s[12:13].
; NOTE(review): entry register roles are assumed from the SGPRs enabled in the
; kernel descriptor that follows this function: s[4:5] = dispatch packet
; pointer, s[6:7] = kernarg pointer, s8 = workgroup id X, v0 = work-item id X.
; Confirm against the AMDGPU code object ABI.
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.p2align 8
|
||||
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
|
||||
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
; %bb.0:
|
||||
; s0 <- dword at s[4:5]+0x4; only its low 16 bits are used below
; (presumably workgroup_size_x of the dispatch packet -- TODO confirm).
s_load_dword s0, s[4:5], 0x4 |
||||
; s[12:13] <- 64-bit loop bound at s[6:7]+0x10 (presumably the by_value
; argument the metadata places at kernarg offset 16).
s_load_dwordx2 s[12:13], s[6:7], 0x10 |
||||
; v1 = 0: high half of the 64-bit index kept in v[0:1].
v_mov_b32_e32 v1, 0 |
||||
; Wait for the scalar loads above before reading s0 and s[12:13].
s_waitcnt lgkmcnt(0) |
||||
; v0 = s8 * (s0 & 0xffff) + v0: this lane's starting index.
s_and_b32 s0, s0, 0xffff |
||||
s_mul_i32 s8, s8, s0 |
||||
v_add_u32_e32 v0, s8, v0 |
||||
; Keep only lanes whose index is below the bound (unsigned 64-bit compare).
; The previous exec mask is saved in s[0:1]; branch to the end if no lane is left.
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_and_saveexec_b64 s[0:1], vcc |
||||
s_cbranch_execz BB0_3 |
||||
; %bb.1:
|
||||
; s14 <- dword at s[4:5]+0xc, used as the per-iteration index stride
; (presumably grid_size_x of the dispatch packet -- TODO confirm).
s_load_dword s14, s[4:5], 0xc |
||||
; s[8:11] <- 16 bytes at s[6:7]+0x0 (presumably the two pointer arguments).
; In the loop s[8:9] is the store base and s[10:11] is the load base.
s_load_dwordx4 s[8:11], s[6:7], 0x0 |
||||
; s15 = 0 so that s[14:15] is the stride zero-extended to 64 bits.
s_mov_b32 s15, 0 |
||||
; v[2:3] = index * 4: byte offset of this lane's element.
v_lshlrev_b64 v[2:3], 2, v[0:1] |
||||
; s[6:7] is reused from here on as the mask of lanes that have finished.
s_mov_b64 s[6:7], 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
; s[4:5] = stride * 4: byte-offset step per iteration.
s_lshl_b64 s[4:5], s[14:15], 2 |
||||
BB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
; v[4:5] = s[10:11] + v[2:3]: address of the element to load.
v_mov_b32_e32 v5, s11 |
||||
v_add_co_u32_e32 v4, vcc, s10, v2 |
||||
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc |
||||
global_load_dword v6, v[4:5], off |
||||
; Three interleaved 64-bit additions, each with its own carry register:
;   v[4:5] = s[8:9] + v[2:3]   store address     (carry in s[0:1])
;   v[0:1] += s[14:15]         next index        (carry in vcc)
;   v[2:3] += s[4:5]           next byte offset  (carry in s[2:3])
; v[4:5] is formed from the old v[2:3] before v2/v3 are updated.
; s[0:1] is reused as a carry register here, so the exec mask saved at entry
; is overwritten; nothing in this function restores exec from it.
v_mov_b32_e32 v5, s9 |
||||
v_mov_b32_e32 v7, s15 |
||||
v_add_co_u32_e32 v0, vcc, s14, v0 |
||||
v_mov_b32_e32 v8, s5 |
||||
v_add_co_u32_e64 v4, s[0:1], s8, v2 |
||||
v_add_co_u32_e64 v2, s[2:3], s4, v2 |
||||
v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] |
||||
v_addc_co_u32_e32 v1, vcc, v1, v7, vcc |
||||
v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] |
||||
; Lanes whose new index has reached the bound are added to the finished mask.
v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_or_b64 s[6:7], vcc, s[6:7] |
||||
; Wait for the load into v6 to return, then square the value and store it.
s_waitcnt vmcnt(0) |
||||
v_mul_f32_e32 v6, v6, v6 |
||||
global_store_dword v[4:5], v6, off |
||||
; Disable the finished lanes and loop while any lane is still active.
s_andn2_b64 exec, exec, s[6:7] |
||||
s_cbranch_execnz BB0_2 |
||||
BB0_3: |
||||
s_endpgm |
||||
.section .rodata,#alloc |
||||
.p2align 6
|
||||
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_kernarg_size 80
|
||||
.amdhsa_user_sgpr_private_segment_buffer 1
|
||||
.amdhsa_user_sgpr_dispatch_ptr 1
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 1
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_next_free_vgpr 9
|
||||
.amdhsa_next_free_sgpr 16
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 1
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 3
|
||||
.amdhsa_float_denorm_mode_16_64 3
|
||||
.amdhsa_dx10_clamp 1
|
||||
.amdhsa_ieee_mode 1
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.end_amdhsa_kernel |
||||
.text |
||||
.Lfunc_end0: |
||||
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x |
||||
; -- End function
|
||||
.section .AMDGPU.csdata |
||||
; Kernel info:
|
||||
; codeLenInByte = 200
|
||||
; NumSgprs: 18
|
||||
; NumVgprs: 9
|
||||
; NumAgprs: 0
|
||||
; TotalNumVgprs: 9
|
||||
; ScratchSize: 0
|
||||
; MemoryBound: 0
|
||||
; FloatMode: 240
|
||||
; IeeeMode: 1
|
||||
; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
; SGPRBlocks: 2
|
||||
; VGPRBlocks: 2
|
||||
; NumSGPRsForWavesPerEU: 18
|
||||
; NumVGPRsForWavesPerEU: 9
|
||||
; Occupancy: 10
|
||||
; WaveLimiterHint : 1
|
||||
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
|
||||
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
||||
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 |
||||
|
||||
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" |
||||
.section ".note.GNU-stack" |
||||
.addrsig |
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.amdgpu_metadata |
||||
--- |
||||
amdhsa.kernels: |
||||
- .args: |
||||
- .address_space: global |
||||
.offset: 0 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .address_space: global |
||||
.offset: 8 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .offset: 16 |
||||
.size: 8 |
||||
.value_kind: by_value |
||||
- .offset: 24 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_x |
||||
- .offset: 32 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_y |
||||
- .offset: 40 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_z |
||||
- .address_space: global |
||||
.offset: 48 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 56 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 64 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 72 |
||||
.size: 8 |
||||
.value_kind: hidden_multigrid_sync_arg |
||||
.group_segment_fixed_size: 0 |
||||
.kernarg_segment_align: 8 |
||||
.kernarg_segment_size: 80 |
||||
.language: OpenCL C |
||||
.language_version: |
||||
- 2 |
||||
- 0 |
||||
.max_flat_workgroup_size: 1024 |
||||
.name: _Z20vector_square_kernelIfEvPT_PKS0_x |
||||
.private_segment_fixed_size: 0 |
||||
.sgpr_count: 18 |
||||
.sgpr_spill_count: 0 |
||||
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd |
||||
.vgpr_count: 9 |
||||
.vgpr_spill_count: 0 |
||||
.wavefront_size: 64 |
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx908 |
||||
amdhsa.version: |
||||
- 1 |
||||
- 1 |
||||
... |
||||
|
||||
.end_amdgpu_metadata |
@ -0,0 +1,226 @@
@@ -0,0 +1,226 @@
|
||||
.text |
||||
.amdgcn_target "amdgcn-amd-amdhsa--gfx90a" |
||||
.protected _Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.globl _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.p2align 8
|
||||
.type _Z20vector_square_kernelIfEvPT_PKS0_x,@function
|
||||
_Z20vector_square_kernelIfEvPT_PKS0_x: ; @_Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
; %bb.0:
|
||||
s_load_dword s0, s[4:5], 0x4 |
||||
s_load_dwordx2 s[12:13], s[6:7], 0x10 |
||||
v_mov_b32_e32 v1, 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
s_and_b32 s0, s0, 0xffff |
||||
s_mul_i32 s8, s8, s0 |
||||
v_add_u32_e32 v0, s8, v0 |
||||
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_and_saveexec_b64 s[0:1], vcc |
||||
s_cbranch_execz BB0_3 |
||||
; %bb.1:
|
||||
s_load_dword s14, s[4:5], 0xc |
||||
s_load_dwordx4 s[8:11], s[6:7], 0x0 |
||||
s_mov_b32 s15, 0 |
||||
v_lshlrev_b64 v[2:3], 2, v[0:1] |
||||
s_mov_b64 s[6:7], 0 |
||||
s_waitcnt lgkmcnt(0) |
||||
s_lshl_b64 s[4:5], s[14:15], 2 |
||||
BB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
v_mov_b32_e32 v5, s11 |
||||
v_add_co_u32_e32 v4, vcc, s10, v2 |
||||
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc |
||||
global_load_dword v6, v[4:5], off |
||||
v_mov_b32_e32 v5, s9 |
||||
v_mov_b32_e32 v7, s15 |
||||
v_add_co_u32_e32 v0, vcc, s14, v0 |
||||
v_mov_b32_e32 v8, s5 |
||||
v_add_co_u32_e64 v4, s[0:1], s8, v2 |
||||
v_add_co_u32_e64 v2, s[2:3], s4, v2 |
||||
v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] |
||||
v_addc_co_u32_e32 v1, vcc, v1, v7, vcc |
||||
v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] |
||||
v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] |
||||
s_or_b64 s[6:7], vcc, s[6:7] |
||||
s_waitcnt vmcnt(0) |
||||
v_mul_f32_e32 v6, v6, v6 |
||||
global_store_dword v[4:5], v6, off |
||||
s_andn2_b64 exec, exec, s[6:7] |
||||
s_cbranch_execnz BB0_2 |
||||
BB0_3: |
||||
s_endpgm |
||||
.section .rodata,#alloc |
||||
.p2align 6
|
||||
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_kernarg_size 80
|
||||
.amdhsa_user_sgpr_private_segment_buffer 1
|
||||
.amdhsa_user_sgpr_dispatch_ptr 1
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 1
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_next_free_vgpr 9
|
||||
.amdhsa_next_free_sgpr 16
|
||||
.amdhsa_accum_offset 12
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 1
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 3
|
||||
.amdhsa_float_denorm_mode_16_64 3
|
||||
.amdhsa_dx10_clamp 1
|
||||
.amdhsa_ieee_mode 1
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_tg_split 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.end_amdhsa_kernel |
||||
.text |
||||
.Lfunc_end0: |
||||
.size _Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x |
||||
; -- End function
|
||||
.section .AMDGPU.csdata |
||||
; Kernel info:
|
||||
; codeLenInByte = 200
|
||||
; NumSgprs: 18
|
||||
; NumVgprs: 9
|
||||
; NumAgprs: 0
|
||||
; TotalNumVgprs: 9
|
||||
; ScratchSize: 0
|
||||
; MemoryBound: 0
|
||||
; FloatMode: 240
|
||||
; IeeeMode: 1
|
||||
; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
; SGPRBlocks: 2
|
||||
; VGPRBlocks: 1
|
||||
; NumSGPRsForWavesPerEU: 18
|
||||
; NumVGPRsForWavesPerEU: 9
|
||||
; AccumOffset: 12
|
||||
; Occupancy: 8
|
||||
; WaveLimiterHint : 1
|
||||
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
|
||||
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
||||
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
||||
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
||||
; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
|
||||
; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
|
||||
.text |
||||
.p2alignl 6, 3212836864 |
||||
.fill 256, 4, 3212836864 |
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1 |
||||
|
||||
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
|
||||
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc |
||||
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE: |
||||
.zero 1
|
||||
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1 |
||||
|
||||
.ident "AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)" |
||||
.section ".note.GNU-stack" |
||||
.addrsig |
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
|
||||
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
|
||||
.amdgpu_metadata |
||||
--- |
||||
amdhsa.kernels: |
||||
- .args: |
||||
- .address_space: global |
||||
.offset: 0 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .address_space: global |
||||
.offset: 8 |
||||
.size: 8 |
||||
.value_kind: global_buffer |
||||
- .offset: 16 |
||||
.size: 8 |
||||
.value_kind: by_value |
||||
- .offset: 24 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_x |
||||
- .offset: 32 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_y |
||||
- .offset: 40 |
||||
.size: 8 |
||||
.value_kind: hidden_global_offset_z |
||||
- .address_space: global |
||||
.offset: 48 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 56 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 64 |
||||
.size: 8 |
||||
.value_kind: hidden_none |
||||
- .address_space: global |
||||
.offset: 72 |
||||
.size: 8 |
||||
.value_kind: hidden_multigrid_sync_arg |
||||
.group_segment_fixed_size: 0 |
||||
.kernarg_segment_align: 8 |
||||
.kernarg_segment_size: 80 |
||||
.language: OpenCL C |
||||
.language_version: |
||||
- 2 |
||||
- 0 |
||||
.max_flat_workgroup_size: 1024 |
||||
.name: _Z20vector_square_kernelIfEvPT_PKS0_x |
||||
.private_segment_fixed_size: 0 |
||||
.sgpr_count: 18 |
||||
.sgpr_spill_count: 0 |
||||
.symbol: _Z20vector_square_kernelIfEvPT_PKS0_x.kd |
||||
.vgpr_count: 9 |
||||
.vgpr_spill_count: 0 |
||||
.wavefront_size: 64 |
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx90a |
||||
amdhsa.version: |
||||
- 1 |
||||
- 1 |
||||
... |
||||
|
||||
.end_amdgpu_metadata |
@ -0,0 +1 @@
@@ -0,0 +1 @@
|
||||
hip_bandwidth |
@ -0,0 +1,56 @@
@@ -0,0 +1,56 @@
|
||||
# MIT License |
||||
# |
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
# |
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
# of this software and associated documentation files (the "Software"), to deal |
||||
# in the Software without restriction, including without limitation the rights |
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
# copies of the Software, and to permit persons to whom the Software is |
||||
# furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included in all |
||||
# copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
# SOFTWARE. |
||||
|
||||
# Build script for the hip_bandwidth example.
# The single source file main.hip is compiled either as HIP or as CUDA,
# selected through the GPU_RUNTIME cache variable.
set(example_name hip_bandwidth)

cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(${example_name} LANGUAGES CXX)

# GPU_RUNTIME selects the language main.hip is compiled as.
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
set(GPU_RUNTIMES "HIP" "CUDA")
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})

if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
    # Quoted so that a value containing ';' is reported verbatim instead of
    # being split into several message arguments.
    message(FATAL_ERROR "${ERROR_MESSAGE}")
endif()

enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

add_executable(${example_name} main.hip)
# Make example runnable using ctest.
# The NAME/COMMAND signature is used because only that form replaces an
# executable target name with the location of the built binary.
add_test(NAME ${example_name} COMMAND ${example_name})
set(include_dirs "../../Common")
# The HIP headers are needed explicitly when compiling with the CUDA toolchain.
if(GPU_RUNTIME STREQUAL "CUDA")
    list(APPEND include_dirs "${ROCM_ROOT}/include")
endif()

target_include_directories(${example_name} PRIVATE ${include_dirs})
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
@ -0,0 +1,54 @@
@@ -0,0 +1,54 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# Builds the hip_bandwidth example from main.hip with hipcc.
EXAMPLE := hip_bandwidth
COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME := HIP

# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include

HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc

# Common variables and flags
CXX_STD   := c++17
CXXFLAGS  := -std=$(CXX_STD)
CPPFLAGS  := -I $(COMMON_INCLUDE_DIR)
LDFLAGS   :=
LDLIBS    :=

ifeq ($(GPU_RUNTIME), CUDA)
    CXXFLAGS += -x cu
    CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
else ifeq ($(GPU_RUNTIME), HIP)
else
    $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
endif

# main.hip includes both cmdparser.hpp and example_utils.hpp, so both headers are
# prerequisites: a change to either one must trigger a rebuild.
# Only the first prerequisite ($<, main.hip) is passed to the compiler.
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/cmdparser.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
	$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@

clean:
	$(RM) $(EXAMPLE)

.PHONY: clean
@ -0,0 +1,28 @@
@@ -0,0 +1,28 @@
|
||||
# Cookbook Bandwidth Example |
||||
|
||||
## Description |
||||
This example measures the memory bandwidth capacity of GPU devices. It performs memcpy from host to GPU device, GPU device to host, and within a single GPU. |
||||
|
||||
### Application flow |
||||
1. User command line arguments are parsed and test parameters initialized. If there are no command line arguments then the test parameters are initialized with default values. |
||||
2. Bandwidth tests are launched. |
||||
3. If the memory type for the test is set to `-memory pageable` then the host side data is instantiated in `std::vector<unsigned char>`. If the memory type for the test is set to `-memory pinned` then the host side data is instantiated in `unsigned char*` and allocated using `hipHostMalloc`. |
||||
4. Device side storage is allocated using `hipMalloc` in `unsigned char*` |
||||
5. Memory transfer is performed `trail` number of times using `hipMemcpy` for pageable memory or using `hipMemcpyAsync` for host allocated pinned memory. |
||||
6. Time of memory transfer operations is measured that is then used to calculate the bandwidth. |
||||
7. All device memory is freed using `hipFree` and all host allocated pinned memory is freed using `hipHostFree`. |
||||
|
||||
## Key APIs and Concepts |
||||
The program uses HIP pageable and pinned memory. It is important to note that the pinned memory is allocated using `hipHostMalloc` and is destroyed using `hipHostFree`. The HIP memory transfer routine `hipMemcpyAsync` will behave synchronously if the host memory is not pinned. Therefore, it is important to allocate pinned host memory using `hipHostMalloc` for `hipMemcpyAsync` to behave asynchronously. |
||||
|
||||
## Demonstrated API Calls |
||||
### HIP runtime |
||||
- `hipMalloc` |
||||
- `hipMemcpy` |
||||
- `hipMemcpyAsync` |
||||
- `hipGetDeviceCount` |
||||
- `hipGetDeviceProperties` |
||||
- `hipFree` |
||||
- `hipHostFree` |
||||
- `hipHostMalloc` |
||||
- `hipSetDevice` |
@ -0,0 +1,102 @@
@@ -0,0 +1,102 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\cmdparser.hpp" /> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{16b11b54-cd72-43b6-b226-38c668b41a79}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>bandwidth_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<RuntimeTypeInfo>true</RuntimeTypeInfo> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<RuntimeTypeInfo>true</RuntimeTypeInfo> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
@ -0,0 +1,637 @@
@@ -0,0 +1,637 @@
|
||||
// MIT License |
||||
// |
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
// |
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
// of this software and associated documentation files (the "Software"), to deal |
||||
// in the Software without restriction, including without limitation the rights |
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
// copies of the Software, and to permit persons to whom the Software is |
||||
// furnished to do so, subject to the following conditions: |
||||
// |
||||
// The above copyright notice and this permission notice shall be included in all |
||||
// copies or substantial portions of the Software. |
||||
// |
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
// SOFTWARE. |
||||
|
||||
#include "cmdparser.hpp" |
||||
#include "example_utils.hpp" |
||||
|
||||
#include <hip/hip_runtime.h> |
||||
|
||||
#include <iostream> |
||||
#include <map> |
||||
#include <numeric> |
||||
#include <vector> |
||||
|
||||
/// \brief Kind of host memory used by the transfers: paged or pinned.
enum class MemoryMode : unsigned int
{
    PAGED  = 0,
    PINNED = 1
};
||||
|
||||
/// \brief Selects how the measured sizes are chosen: a range of input sizes with a
/// constant increment, or a more complex shmoo test that measures the bandwidth for a
/// large number of varying sizes.
enum class TestMode : unsigned int
{
    RANGED = 0,
    SHMOO  = 1
};
||||
|
||||
/// \brief Run host to device or device to host transfer, bandwidth calculated for the specified configuration |
||||
std::vector<double> |
||||
run_bandwidth_host_device(const std::vector<unsigned long>& memory_copy_measurement_sizes, |
||||
const int device, |
||||
hipMemcpyKind hip_memcpy_kind, |
||||
const MemoryMode memory_mode, |
||||
const unsigned int trails) |
||||
{ |
||||
|
||||
// Check for invalid configurations |
||||
if(hip_memcpy_kind == hipMemcpyDeviceToDevice) |
||||
{ |
||||
std::cerr << "hipMemcpyDeviceToDevice is an invalid Configuration\n"; |
||||
exit(error_exit_code); |
||||
} |
||||
|
||||
// The bandwidths calculated will be stored in bandwidth_measurements |
||||
std::vector<double> bandwidth_measurements; |
||||
|
||||
// Flush buffer for CPU cache |
||||
constexpr size_t flush_size = 256 * 1024 * 1024; |
||||
std::vector<char> flush_buffer(flush_size); |
||||
|
||||
HIP_CHECK(hipSetDevice(device)); |
||||
|
||||
if(hip_memcpy_kind == hipMemcpyHostToDevice) |
||||
{ |
||||
std::cout << "Measuring Host to Device Bandwidth: " << std::flush; |
||||
} |
||||
else |
||||
{ |
||||
std::cout << "Measuring Device to Host Bandwidth: " << std::flush; |
||||
} |
||||
|
||||
for(auto size : memory_copy_measurement_sizes) |
||||
{ |
||||
std::cout << "[" << size << "] " << std::flush; |
||||
|
||||
// Blocks used to clear host cache |
||||
const unsigned long long cache_clear_size = 1 << 24; |
||||
std::vector<unsigned char> h_cache_block_1(cache_clear_size); |
||||
std::vector<unsigned char> h_cache_block_2(cache_clear_size); |
||||
|
||||
// Size in bytes |
||||
const size_t size_in_bytes = sizeof(unsigned char) * size; |
||||
|
||||
// Allocate device input memory |
||||
unsigned char* d_in = nullptr; |
||||
HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); |
||||
|
||||
// Memory transfer from host to device |
||||
if(memory_mode == MemoryMode::PAGED) |
||||
{ |
||||
// Host input memory |
||||
std::vector<unsigned char> h_in(size); |
||||
|
||||
// Host output memory |
||||
std::vector<unsigned char> h_out(size); |
||||
|
||||
// Initialize the host input memory |
||||
for(unsigned int i = 0; i < size; i++) |
||||
{ |
||||
h_in[i] = static_cast<unsigned char>(i & 0xff); |
||||
} |
||||
|
||||
unsigned char* src = nullptr; |
||||
unsigned char* dst = nullptr; |
||||
|
||||
switch(hip_memcpy_kind) |
||||
{ |
||||
case hipMemcpyHostToDevice: |
||||
// Set the source and destination for hipMemcpy |
||||
src = h_in.data(); |
||||
dst = d_in; |
||||
break; |
||||
case hipMemcpyDeviceToHost: |
||||
// Transfer the host input to device |
||||
HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Set the source and destination for hipMemcpy |
||||
src = d_in; |
||||
dst = h_out.data(); |
||||
break; |
||||
default: |
||||
std::cerr << "Invalid memcpy kind " << hip_memcpy_kind << "! \n"; |
||||
exit(error_exit_code); |
||||
} |
||||
|
||||
// Fill the host cache clear buffers |
||||
for(unsigned int i = 0; i < h_cache_block_1.size(); i++) |
||||
{ |
||||
h_cache_block_1[i] = static_cast<unsigned char>(i & 0xff); |
||||
h_cache_block_2[i] = static_cast<unsigned char>(0xff - (i & 0xff)); |
||||
} |
||||
|
||||
// Timer class |
||||
HostClock host_clock; |
||||
|
||||
// Perform memory transfers warm up |
||||
for(unsigned int i = 0; i < 5; i++) |
||||
{ |
||||
// Initiate the memory transfer |
||||
HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hip_memcpy_kind)); |
||||
|
||||
// Flush the buffer |
||||
memset(flush_buffer.data(), i, flush_buffer.size()); |
||||
} |
||||
|
||||
// Perform memory transfers for trails number of times |
||||
for(unsigned int i = 0; i < trails; i++) |
||||
{ |
||||
host_clock.start_timer(); |
||||
|
||||
// Initiate the memory transfer |
||||
HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hip_memcpy_kind)); |
||||
|
||||
host_clock.stop_timer(); |
||||
|
||||
// Flush the buffer |
||||
memset(flush_buffer.data(), i, flush_buffer.size()); |
||||
} |
||||
// Calculate the bandwith in GB/s |
||||
const double bandwidth_achieved |
||||
= ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time(); |
||||
|
||||
bandwidth_measurements.emplace_back(bandwidth_achieved); |
||||
} |
||||
else if(memory_mode == MemoryMode::PINNED) // Pinned memory mode |
||||
{ |
||||
// Host input memory |
||||
unsigned char* h_in = nullptr; |
||||
|
||||
// Host output memory |
||||
unsigned char* h_out = nullptr; |
||||
|
||||
HIP_CHECK(hipHostMalloc(&h_in, size_in_bytes)); |
||||
HIP_CHECK(hipHostMalloc(&h_out, size_in_bytes)); |
||||
|
||||
// Initialize the host memory |
||||
for(unsigned int i = 0; i < size; i++) |
||||
{ |
||||
h_in[i] = static_cast<unsigned char>(i & 0xff); |
||||
} |
||||
|
||||
unsigned char* src = nullptr; |
||||
unsigned char* dst = nullptr; |
||||
|
||||
if(hip_memcpy_kind == hipMemcpyHostToDevice) |
||||
{ |
||||
// Set the source and destination for hipMemcpy |
||||
src = h_in; |
||||
dst = d_in; |
||||
} |
||||
else if(hip_memcpy_kind == hipMemcpyDeviceToHost) |
||||
{ |
||||
// Transfer the host input to device |
||||
HIP_CHECK(hipMemcpyAsync(d_in, h_in, size_in_bytes, hip_memcpy_kind)); |
||||
HIP_CHECK(hipDeviceSynchronize()); |
||||
|
||||
// Set the source and destination for hipMemcpy |
||||
src = d_in; |
||||
dst = h_out; |
||||
} |
||||
|
||||
// Perform memory transfers warm up |
||||
for(unsigned int i = 0; i < 5; i++) |
||||
{ |
||||
HIP_CHECK(hipMemcpyAsync(dst, src, size_in_bytes, hip_memcpy_kind)); |
||||
} |
||||
HIP_CHECK(hipDeviceSynchronize()); |
||||
|
||||
HostClock host_clock; |
||||
host_clock.start_timer(); |
||||
|
||||
// Initiate the memory transfer |
||||
// Perform memory transfers for trails number of times |
||||
for(unsigned int i = 0; i < trails; i++) |
||||
{ |
||||
HIP_CHECK(hipMemcpyAsync(dst, src, size_in_bytes, hip_memcpy_kind)); |
||||
} |
||||
|
||||
HIP_CHECK(hipDeviceSynchronize()); |
||||
|
||||
host_clock.stop_timer(); |
||||
|
||||
// Calculate the bandwith in GB/s |
||||
const double bandwidth_achieved |
||||
= ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time(); |
||||
|
||||
bandwidth_measurements.emplace_back(bandwidth_achieved); |
||||
|
||||
HIP_CHECK(hipHostFree(h_in)); |
||||
HIP_CHECK(hipHostFree(h_out)); |
||||
} |
||||
|
||||
// Free the memory |
||||
HIP_CHECK(hipFree(d_in)); |
||||
} |
||||
std::cout << std::endl; |
||||
|
||||
return bandwidth_measurements; |
||||
} |
||||
|
||||
/// \brief Run device to device transfer, bandwidth calculated for the specified configuration |
||||
std::vector<double> |
||||
run_bandwidth_device_device(std::vector<unsigned long> memory_copy_measurement_sizes, |
||||
const int device, |
||||
const unsigned int trails) |
||||
{ |
||||
|
||||
// The bandwidths calculated will be stored in bandwidth_measurements |
||||
std::vector<double> bandwidth_measurements; |
||||
|
||||
HIP_CHECK(hipSetDevice(device)); |
||||
|
||||
std::cout << "Measuring Device to Device Bandwith: " << std::flush; |
||||
for(auto size : memory_copy_measurement_sizes) |
||||
{ |
||||
std::cout << "[" << size << "] " << std::flush; |
||||
|
||||
// Size in bytes |
||||
const size_t size_in_bytes = sizeof(unsigned char) * size; |
||||
|
||||
// Allocate device input memory |
||||
unsigned char* d_in = nullptr; |
||||
HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); |
||||
|
||||
// Device output memory. |
||||
unsigned char* d_out = nullptr; |
||||
HIP_CHECK(hipMalloc(&d_out, size_in_bytes)); |
||||
|
||||
// Host input memory |
||||
std::vector<unsigned char> h_in(size); |
||||
|
||||
// Initialize the host input memory |
||||
for(unsigned int i = 0; i < size; i++) |
||||
{ |
||||
h_in[i] = static_cast<unsigned char>(i & 0xff); |
||||
} |
||||
|
||||
// Transfer the host input to device |
||||
HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Set the source and destination for hipMemcpy |
||||
unsigned char* src = d_in; |
||||
unsigned char* dst = d_out; |
||||
|
||||
// Perform memory transfers warm up |
||||
for(unsigned int i = 0; i < 5; i++) |
||||
{ |
||||
// Initiate the memory transfer |
||||
HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hipMemcpyDeviceToDevice)); |
||||
} |
||||
|
||||
// Synchronize because the device to device memory copy is non-blocking |
||||
HIP_CHECK(hipDeviceSynchronize()); |
||||
|
||||
// Timer class |
||||
HostClock host_clock; |
||||
host_clock.start_timer(); |
||||
|
||||
// Perform memory transfers for trails number of times |
||||
for(unsigned int i = 0; i < trails; i++) |
||||
{ |
||||
// Initiate the memory transfer |
||||
HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hipMemcpyDeviceToDevice)); |
||||
} |
||||
HIP_CHECK(hipDeviceSynchronize()); |
||||
|
||||
host_clock.stop_timer(); |
||||
|
||||
// Calculate the bandwith in GB/s |
||||
const double bandwidth_achieved |
||||
= ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time(); |
||||
|
||||
bandwidth_measurements.emplace_back(bandwidth_achieved); |
||||
|
||||
// Free the device output memory |
||||
HIP_CHECK(hipFree(d_out)); |
||||
|
||||
// Free the memory |
||||
HIP_CHECK(hipFree(d_in)); |
||||
} |
||||
std::cout << std::endl; |
||||
|
||||
return bandwidth_measurements; |
||||
} |
||||
|
||||
// Generates the transfer sizes (in bytes) measured in "range" mode.
//
// The result is the arithmetic sequence
//   start_measurement, start_measurement + stride, start_measurement + 2 * stride, ...
// containing every element that is strictly smaller than end_measurement.
//
// Edge cases:
//  - start_measurement >= end_measurement yields an empty list.
//  - A stride of zero can never advance towards end_measurement; instead of looping
//    forever, a single measurement at start_measurement is generated.
std::vector<unsigned long>
    generate_measurement_sizes_range(const size_t start_measurement,
                                     const size_t end_measurement,
                                     const size_t stride_between_measurements)
{
    // The size of data to copy for each measurement
    std::vector<unsigned long> memory_copy_measurement_sizes;

    // Empty (or inverted) range: nothing to measure
    if(start_measurement >= end_measurement)
    {
        return memory_copy_measurement_sizes;
    }

    // A zero stride would repeat the same size endlessly, measure it exactly once instead
    if(stride_between_measurements == 0)
    {
        memory_copy_measurement_sizes.emplace_back(start_measurement);
        return memory_copy_measurement_sizes;
    }

    // Number of sequence elements in [start_measurement, end_measurement).
    // Computing the count up front (rather than testing `i < end` after `i += stride`)
    // avoids unsigned wrap-around when end_measurement is close to the maximum of size_t.
    const size_t measurement_count
        = (end_measurement - start_measurement - 1) / stride_between_measurements + 1;
    memory_copy_measurement_sizes.reserve(measurement_count);

    for(size_t index = 0; index < measurement_count; ++index)
    {
        // index * stride <= end - start - 1, so this sum cannot overflow
        memory_copy_measurement_sizes.emplace_back(start_measurement
                                                   + index * stride_between_measurements);
    }

    return memory_copy_measurement_sizes;
}
||||
|
||||
// Generates the transfer sizes (in bytes) measured in "shmoo" mode.
//
// Starting from zero, the size is repeatedly advanced by a step that depends on the
// current size, and every advanced size is recorded. The step is chosen before
// advancing, so the last recorded size may exceed the 64 MB bound that ends the sweep.
std::vector<unsigned long> generate_measurement_sizes_shmoo()
{
    // Units used to express the bands below
    const size_t kib = 1 << 10;
    const size_t mib = 1 << 20;

    // While the current size is strictly below `upper_bound`, it grows by `step`
    struct SizeBand
    {
        size_t upper_bound;
        size_t step;
    };

    // Bands in ascending order; the first band whose bound is not yet reached applies
    const SizeBand bands[] = {
        {20 * kib, 1 * kib}, //   < 20 KB: 1 KB steps
        {50 * kib, 2 * kib}, //   < 50 KB: 2 KB steps
        {100 * kib, 10 * kib}, // < 100 KB: 10 KB steps
        {1 * mib, 100 * kib}, //  < 1 MB: 100 KB steps
        {16 * mib, 1 * mib}, //   < 16 MB: 1 MB steps
        {32 * mib, 2 * mib}, //   < 32 MB: 2 MB steps
    };

    // Step used once the size has left the last band
    const size_t step_beyond_bands = 4 * mib;

    // The sweep continues as long as the current size does not exceed 64 MB
    const size_t sweep_bound = 64 * mib;

    // The size of data to copy for each measurement
    std::vector<unsigned long> sizes;

    size_t bytes = 0;
    do
    {
        size_t step = step_beyond_bands;
        for(const SizeBand& band : bands)
        {
            if(bytes < band.upper_bound)
            {
                step = band.step;
                break;
            }
        }

        bytes += step;
        sizes.emplace_back(bytes);
    }
    while(bytes <= sweep_bound);

    return sizes;
}
||||
|
||||
// Registers all optional command line arguments of the bandwidth test, together with
// their default values and help texts, on the given parser.
void configure_parser(cli::Parser& parser)
{
    using string_list = std::vector<std::string>;

    // Defaults of the "range" mode sweep
    const size_t default_start  = 1 << 20; // 1 MB
    const size_t default_end    = 1 << 23; // 8 MB
    const size_t default_stride = 1 << 22; // 4 MB

    // Remaining defaults
    const std::string default_mode   = "range";
    const std::string default_memory = "pageable";
    const size_t      default_trials = 50;
    const string_list default_devices{"0"};
    const string_list default_memcpy_kinds{"htod", "dtoh", "dtod"};

    // Measurement sizes
    parser.set_optional<size_t>("start", "start", default_start, "Starting size");
    parser.set_optional<size_t>("end", "end", default_end, "Ending size");
    parser.set_optional<size_t>("stride",
                                "stride",
                                default_stride,
                                "Stride (or increament) between sizes");

    // Test mode and host memory kind
    parser.set_optional<std::string>("mode",
                                     "mode",
                                     default_mode,
                                     "Mode of bandwidth test: range or shmoo");
    parser.set_optional<std::string>("memory",
                                     "memory",
                                     default_memory,
                                     "Memory allocation kind: pageable or pinned\n");

    // Repetitions per measurement
    parser.set_optional<size_t>("trials", "trials", default_trials, "Number of trials");

    // Devices and copy directions to test
    parser.set_optional<string_list>("device",
                                     "device",
                                     default_devices,
                                     "Space-separated list of devices\n"
                                     "\tall for using all the available devices\n"
                                     "\t0,1,2,...,n for using any particular available devices");
    parser.set_optional<string_list>("memcpy",
                                     "memcpy",
                                     default_memcpy_kinds,
                                     "Space-separated list of memory copy kind.\n"
                                     "\thtod is host to device\n"
                                     "\tdtoh is device to host\n"
                                     "\tdtod is device to device");
}
||||
|
||||
int main(int argc, char** argv) |
||||
{ |
||||
|
||||
// Get the number of hip devices in the system |
||||
int number_of_devices = 0; |
||||
HIP_CHECK(hipGetDeviceCount(&number_of_devices)) |
||||
|
||||
if(number_of_devices <= 0) |
||||
{ |
||||
std::cerr << "HIP supported devices not found!" |
||||
<< "\n"; |
||||
exit(error_exit_code); |
||||
} |
||||
|
||||
// Parse user inputs |
||||
cli::Parser parser(argc, argv); |
||||
configure_parser(parser); |
||||
parser.run_and_exit_if_error(); |
||||
|
||||
// Set configurations for testing bandwidth |
||||
const size_t trials = parser.get<size_t>("trials"); |
||||
const size_t start_measurement = parser.get<size_t>("start"); |
||||
const size_t end_measurement = parser.get<size_t>("end"); |
||||
const size_t stride_between_measurements = parser.get<size_t>("stride"); |
||||
const std::string mode = parser.get<std::string>("mode"); |
||||
const std::string memory_cmd = parser.get<std::string>("memory"); |
||||
const std::vector<std::string> devices_cmd = parser.get<std::vector<std::string>>("device"); |
||||
const std::vector<std::string> memcpy_cmd = parser.get<std::vector<std::string>>("memcpy"); |
||||
|
||||
// Set the mode of bandwidth test: RANGED or SHMOO |
||||
TestMode mode_of_test; |
||||
|
||||
if(mode == "range") |
||||
{ |
||||
mode_of_test = TestMode::RANGED; |
||||
} |
||||
else if(mode == "shmoo") |
||||
{ |
||||
mode_of_test = TestMode::SHMOO; |
||||
} |
||||
else |
||||
{ |
||||
std::cerr << "Invalid mode " << mode << "! \n"; |
||||
exit(error_exit_code); |
||||
} |
||||
|
||||
// Set the memory host allocation type: PAGED or PINNED |
||||
MemoryMode memory_allocation; |
||||
if(memory_cmd == "pageable") |
||||
{ |
||||
memory_allocation = MemoryMode::PAGED; |
||||
} |
||||
else if(memory_cmd == "pinned") |
||||
{ |
||||
memory_allocation = MemoryMode::PINNED; |
||||
} |
||||
else |
||||
{ |
||||
std::cerr << "Invalid memory allocation " << memory_cmd << "! \n"; |
||||
exit(error_exit_code); |
||||
} |
||||
|
||||
// Store device ids |
||||
std::vector<int> devices; |
||||
if(std::find(devices_cmd.begin(), devices_cmd.end(), "all") != devices_cmd.end()) |
||||
{ |
||||
devices = std::vector<int>(number_of_devices); |
||||
|
||||
// Initialize the default device ids |
||||
std::iota(devices.begin(), devices.end(), 0); |
||||
} |
||||
else |
||||
{ |
||||
for(const std::string& device : devices_cmd) |
||||
{ |
||||
int device_id; |
||||
if(!parse_int_string(device, device_id)) |
||||
{ |
||||
std::cerr << "Invalid device ID " << device << "!\n"; |
||||
exit(error_exit_code); |
||||
} |
||||
|
||||
if(device_id < 0 || device_id >= number_of_devices) |
||||
{ |
||||
std::cerr << "Invalid device id " << device << "!\n" |
||||
<< "Device does not exist\n"; |
||||
exit(error_exit_code); |
||||
} |
||||
devices.emplace_back(device_id); |
||||
} |
||||
} |
||||
|
||||
std::cout << "Devices: " << format_range(devices.begin(), devices.end()) << "\n"; |
||||
|
||||
// Set hipMemcpyKind |
||||
std::map<hipMemcpyKind, std::string> memcpy_kinds; |
||||
if(std::find(memcpy_cmd.begin(), memcpy_cmd.end(), "all") != memcpy_cmd.end()) |
||||
{ |
||||
memcpy_kinds.insert({hipMemcpyHostToDevice, "Host to Device"}); |
||||
memcpy_kinds.insert({hipMemcpyDeviceToHost, "Device to Host"}); |
||||
memcpy_kinds.insert({hipMemcpyDeviceToDevice, "Device to Device"}); |
||||
} |
||||
else |
||||
{ |
||||
for(std::string memcpy : memcpy_cmd) |
||||
{ |
||||
if(memcpy == "htod") |
||||
{ |
||||
memcpy_kinds.insert({hipMemcpyHostToDevice, "Host to Device"}); |
||||
} |
||||
else if(memcpy == "dtoh") |
||||
{ |
||||
memcpy_kinds.insert({hipMemcpyDeviceToHost, "Device to Host"}); |
||||
} |
||||
else if(memcpy == "dtod") |
||||
{ |
||||
memcpy_kinds.insert({hipMemcpyDeviceToDevice, "Device to Device"}); |
||||
} |
||||
else |
||||
{ |
||||
std::cerr << "Invalid memcpy!" |
||||
<< "\n"; |
||||
exit(error_exit_code); |
||||
} |
||||
} |
||||
} |
||||
|
||||
std::vector<unsigned long> memory_copy_measurement_sizes; |
||||
if(mode_of_test == TestMode::RANGED) |
||||
{ |
||||
memory_copy_measurement_sizes |
||||
= generate_measurement_sizes_range(start_measurement, |
||||
end_measurement, |
||||
stride_between_measurements); |
||||
} |
||||
else |
||||
{ |
||||
memory_copy_measurement_sizes = generate_measurement_sizes_shmoo(); |
||||
} |
||||
|
||||
std::cout << "Measurement Sizes: " |
||||
<< format_range(memory_copy_measurement_sizes.begin(), |
||||
memory_copy_measurement_sizes.end()) |
||||
<< "\n\n"; |
||||
|
||||
// Run the bandwidth tests on devices |
||||
for(auto device : devices) |
||||
{ |
||||
hipDeviceProp_t devProp; |
||||
HIP_CHECK(hipSetDevice(device)); |
||||
HIP_CHECK(hipGetDeviceProperties(&devProp, device)); |
||||
|
||||
for(auto memcpy_kind : memcpy_kinds) |
||||
{ |
||||
std::string print_text; |
||||
if(memory_allocation == MemoryMode::PAGED) |
||||
{ |
||||
print_text = "Paged Bandwidth "; |
||||
} |
||||
else if(memory_allocation == MemoryMode::PINNED) |
||||
{ |
||||
print_text = "Pinned Bandwidth "; |
||||
} |
||||
if(memcpy_kind.first == hipMemcpyDeviceToDevice) |
||||
{ |
||||
print_text = "Bandwidth "; |
||||
} |
||||
|
||||
std::vector<double> bandwidth_measurements; |
||||
if(memcpy_kind.first == hipMemcpyDeviceToDevice) |
||||
{ |
||||
bandwidth_measurements |
||||
= run_bandwidth_device_device(memory_copy_measurement_sizes, device, trials); |
||||
} |
||||
else |
||||
{ |
||||
bandwidth_measurements = run_bandwidth_host_device(memory_copy_measurement_sizes, |
||||
device, |
||||
memcpy_kind.first, |
||||
memory_allocation, |
||||
trials); |
||||
} |
||||
std::cout << "\nDevice ID [" << device << "] Device Name [" << devProp.name |
||||
<< "]: " << print_text << memcpy_kind.second << " (GB/s): " |
||||
<< format_range(bandwidth_measurements.begin(), bandwidth_measurements.end()) |
||||
<< "\n\n"; |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,4 @@
@@ -0,0 +1,4 @@
|
||||
hip_llvm_ir_to_executable |
||||
*.bc |
||||
*.o |
||||
*.hipfb |
@ -0,0 +1,174 @@
@@ -0,0 +1,174 @@
|
||||
# MIT License |
||||
# |
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
# |
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
# of this software and associated documentation files (the "Software"), to deal |
||||
# in the Software without restriction, including without limitation the rights |
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
# copies of the Software, and to permit persons to whom the Software is |
||||
# furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included in all |
||||
# copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
# SOFTWARE. |
||||
|
||||
set(example_name hip_llvm_ir_to_executable)

cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(${example_name} LANGUAGES CXX)

set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")

# Only supported on HIP (not CUDA)
if(NOT "${GPU_RUNTIME}" STREQUAL "HIP")
    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.")
    message(FATAL_ERROR ${ERROR_MESSAGE})
endif()

enable_language(${GPU_RUNTIME})
set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)

set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
if(NOT CMAKE_PREFIX_PATH)
    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
endif()

# GPU_ARCHITECTURES defaults to CMAKE_HIP_ARCHITECTURES when that is given, and to "all" otherwise.
if (NOT DEFINED CMAKE_HIP_ARCHITECTURES)
    set(GPU_ARCHITECTURES "all" CACHE STRING "GPU architectures to compile for")
else()
    set(GPU_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "GPU architectures to compile for")
endif()

# "all" expands to the architectures for which a main_<arch>.ll file is listed here.
if(GPU_ARCHITECTURES STREQUAL "all")
    set(GPU_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "GPU architectures to compile for" FORCE)
endif()

# Remove duplicates
list(REMOVE_DUPLICATES GPU_ARCHITECTURES)
message(STATUS "GPU_ARCHITECTURES: ${GPU_ARCHITECTURES}")

# The device code comes from the LLVM IR files, so main.hip is compiled for the host only.
set_source_files_properties(main.hip PROPERTIES COMPILE_OPTIONS "--cuda-host-only")

# Platform dependent settings: object file extension, null device, host target triple
# and the assembler template that embeds the offload bundle.
if (WIN32)
    set(OBJ_TYPE obj)
    set(NULDEV NUL)
    set(HOST_TARGET x86_64-pc-windows-msvc)
    set(HIP_OBJ_GEN_FILE hip_obj_gen_win.mcin)
else()
    set(OBJ_TYPE o)
    set(NULDEV /dev/null)
    set(HOST_TARGET x86_64-unknown-linux)
    set(HIP_OBJ_GEN_FILE hip_obj_gen.mcin)
endif()

# Compile the device LLVM IR files to object files using the HIP compiler.
# The compiler needs -target amdgcn-amd-amdhsa -mcpu=gfx* in order to generate the object file
# for the right GPU.
foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
    message(STATUS "Generating main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
    add_custom_command(
        OUTPUT main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
        COMMAND ${CMAKE_HIP_COMPILER} -fPIC -target amdgcn-amd-amdhsa -mcpu=${HIP_ARCHITECTURE}
            ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.ll
            -o ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.ll
        VERBATIM)
endforeach()

# Create an offload-bundle from the generated object files. This needs the clang-offload-bundler tool.
find_program(
    OFFLOAD_BUNDLER_COMMAND clang-offload-bundler
    PATH_SUFFIXES bin
    PATHS
        ${ROCM_ROOT}/llvm
        ${CMAKE_INSTALL_PREFIX}/llvm
    REQUIRED)

# Report the path stored in the variable that find_program() actually sets.
if(OFFLOAD_BUNDLER_COMMAND)
    message(STATUS "clang-offload-bundler found: ${OFFLOAD_BUNDLER_COMMAND}")
else()
    message(FATAL_ERROR "clang-offload-bundler not found")
endif()

# Generate object bundle.
# The invocation to generate is
# clang-offload-bundler -targets=<targets> -input=<input target #1> -input=<input target #2> ... -output=<output>
# Note that the host target must be the first target present here, and it should have an empty input associated to it.

# Generate BUNDLE_TARGETS as a string of: -targets=host-${HOST_TARGET},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE},...
set(BUNDLE_TARGETS "-targets=host-${HOST_TARGET}")
# Generate BUNDLE_INPUTS as a list of: -input=${NULDEV} -input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} ...
set(BUNDLE_INPUTS "-input=${NULDEV}")
# Generate BUNDLE_OBJECTS as a list of: ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} ...
set(BUNDLE_OBJECTS "")
foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
    set(BUNDLE_TARGETS "${BUNDLE_TARGETS},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE}")
    list(APPEND BUNDLE_INPUTS "-input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
    list(APPEND BUNDLE_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
endforeach()

# Invoke clang-offload-bundler to generate an offload bundle.
set(BUNDLE "${CMAKE_CURRENT_BINARY_DIR}/offload_bundle.hipfb")
add_custom_command(
    OUTPUT "${BUNDLE}"
    COMMAND
        "${OFFLOAD_BUNDLER_COMMAND}"
        -type=o
        -bundle-align=4096
        "${BUNDLE_TARGETS}"
        ${BUNDLE_INPUTS}
        "-output=${BUNDLE}"
    DEPENDS ${BUNDLE_OBJECTS}
    VERBATIM)

# Create the device binary by assembling the template that includes
# the offload bundle that was just generated using an .incbin directive.
# This needs an assembler.
find_program(
    LLVM_MC_COMMAND llvm-mc
    PATH_SUFFIXES bin
    PATHS
        ${ROCM_ROOT}/llvm
        ${CMAKE_INSTALL_PREFIX}/llvm)

if(LLVM_MC_COMMAND)
    message(STATUS "llvm-mc found: ${LLVM_MC_COMMAND}")
else()
    message(FATAL_ERROR "llvm-mc not found")
endif()

# Invoke llvm-mc to generate an object file containing the offload bundle.
set(DEVICE_OBJECT "${CMAKE_CURRENT_BINARY_DIR}/main_device.${OBJ_TYPE}")
add_custom_command(
    OUTPUT "${DEVICE_OBJECT}"
    COMMAND
        "${LLVM_MC_COMMAND}"
        -triple "${HOST_TARGET}"
        "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}"
        -o "${DEVICE_OBJECT}"
        --filetype=obj
    DEPENDS "${BUNDLE}"
    VERBATIM)

# Finally, create the executable.
add_executable(
    ${example_name}
    main.hip
    ${DEVICE_OBJECT})

# Make example runnable using ctest
add_test(${example_name} ${example_name})

set(include_dirs "../../Common")
target_include_directories(${example_name} PRIVATE ${include_dirs})
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
@ -0,0 +1,90 @@
@@ -0,0 +1,90 @@
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
EXAMPLE := hip_llvm_ir_to_executable
COMMON_INCLUDE_DIR := ../../Common
GPU_RUNTIME ?= HIP

ifneq ($(GPU_RUNTIME), HIP)
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
endif


# HIP variables
ROCM_INSTALL_DIR := /opt/rocm
HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include

HIPCXX                ?= $(ROCM_INSTALL_DIR)/bin/hipcc
CLANG                 ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang
LLVM_MC               ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc
CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler

# Common variables and flags
CXX_STD   := c++17
CXXFLAGS  := -std=$(CXX_STD)
CPPFLAGS  := -I $(COMMON_INCLUDE_DIR)
LDFLAGS   :=
LDLIBS    :=

# Compile for these GPU architectures
HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030

# If white-space is given as a literal, `subst` cannot recognize it.
# Therefore this `empty` `space` hack is used in the tokenizing of HIP_ARCHITECTURES
# and the creation of GPU_ARCH_TRIPLES, which is later passed to CLANG_OFFLOAD_BUNDLER
# in the targets option. The targets option needs to be a single string with no spaces.
empty =
space = $(empty) $(empty)
comma = ,

# Space-separated list of architectures, e.g. "gfx803 gfx900"
GPU_ARCHS := $(subst ;,$(space),$(HIP_ARCHITECTURES))
# Comma-separated list of bundle targets, e.g. "hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-..."
GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amdhsa--%))

all: $(EXAMPLE)

# Link the host object and the object that embeds the offload bundle
$(EXAMPLE): main.o main_device.o
	$(HIPCXX) -o $@ $^

# Embed the offload bundle into a host object file via the assembler template
main_device.o: hip_obj_gen.mcin offload_bundle.hipfb
	$(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj

# Bundle the per-architecture device objects; the host entry comes first and is empty
offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o)
	$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 \
		-targets=host-x86_64-unknown-linux,$(GPU_ARCH_TRIPLES) \
		-input=/dev/null \
		$(^:%=-input=%) \
		-output=$@

# Compile only the host part of main.hip
main.o: main.hip
	$(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only -o $@ $<

# Compile the device LLVM IR of one architecture into a device object
main_%.o: main_%.ll
	$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $<

# main_*.o covers main_device.o as well as the per-architecture device objects
clean:
	rm -f \
		main_*.o \
		main_*.bc \
		offload_bundle.hipfb \
		main.o \
		$(EXAMPLE)

# Only targets that do not produce a file of their own name are phony; $(EXAMPLE)
# is a real output and must not be relinked when it is already up to date.
.PHONY: all clean
@ -0,0 +1,117 @@
@@ -0,0 +1,117 @@
|
||||
# HIP-Basic LLVM-IR to Executable Example |
||||
|
||||
## Description |
||||
This example shows how to manually compile and link a HIP application from device LLVM IR. Pre-generated LLVM-IR files are compiled into an _offload bundle_, a bundle of device object files, and then linked with the host object file to produce the final executable. |
||||
|
||||
LLVM IR is the intermediary language used by the LLVM compiler, which hipcc is built on. Building HIP executables from LLVM IR can be useful for example to experiment with specific LLVM instructions, or can help debugging miscompilations. |
||||
|
||||
### Building |
||||
|
||||
- Build with Makefile: to compile for specific GPU architectures, optionally provide the HIP_ARCHITECTURES variable. Provide the architectures separated by semicolons. |
||||
```shell |
||||
make HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" |
||||
``` |
||||
- Build with CMake: |
||||
```shell |
||||
cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" |
||||
cmake --build build |
||||
``` |
||||
On Windows the path to RC compiler may be needed: `-DCMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/path/to/x64/rc.exe"` |
||||
|
||||
## Generating device LLVM IR |
||||
In this example, a HIP executable is compiled from device LLVM IR code. LLVM IR can be written completely manually, but in this example they are generated from `main.hip`, using the following commands: |
||||
```shell |
||||
$ROCM_INSTALL_DIR/bin/hipcc --cuda-device-only -c -emit-llvm ./main.hip --offload-arch=<arch> -o main_<arch>.bc -I ../../Common |
||||
$ROCM_INSTALL_DIR/llvm/bin/llvm-dis main_<arch>.bc -o main_<arch>.ll |
||||
``` |
||||
Where `<arch>` is the architecture to generate the LLVM IR for. Note that the `--cuda-device-only` flag is required to instruct `hipcc` to only generate LLVM IR for the device part of the computation, and `-c` is required to prevent the compiler from linking the outputs into an executable. In the case of this example, the LLVM IR files were generated using architectures `gfx803`, `gfx900`, `gfx906`, `gfx908`, `gfx90a`, `gfx1030`. The user may modify the `--offload-arch` flag to build for other architectures and choose to either enable or disable extra device code-generation features such as `xnack` or `sram-ecc`, which can be specified as `--offload-arch=<arch>:<feature>+` to enable it or `--offload-arch=<arch>:<feature>-` to disable it. Multiple features may be present, separated by colons. |
||||
|
||||
The first of these two commands generates a _bitcode_ module: this is a binary encoded version of LLVM IR. The second command, using `llvm-dis` disassembles the bitcode module into textual LLVM IR. |
||||
|
||||
## Build Process |
||||
A HIP binary consists of a regular host executable, which has an offload bundle containing device code embedded inside it. This offload bundle contains object files for each of the target devices that it is compiled for, and is loaded at runtime to provide the machine code for the current device. A HIP executable can be built from device LLVM IR and host HIP code according to the following process: |
||||
|
||||
1. The `main.hip` file is compiled to an object file with `hipcc` that only contains host code by using the `--cuda-host-only` option. `main.hip` is a program that launches a simple kernel to compute the square of each element of a vector. The `-c` option is required to prevent the compiler from creating an executable, and make it create an object file containing the compiled host code instead. |
||||
```shell |
||||
$ROCM_INSTALL_DIR/bin/hipcc -c --cuda-host-only main.hip |
||||
``` |
||||
|
||||
2. Each LLVM IR file is assembled to a device object file using `clang`. This requires specifying the correct architecture using `-target amdgcn-amd-amdhsa`, and the target architecture that should be assembled for using `-mcpu`: |
||||
|
||||
```shell |
||||
$ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx1030 main_gfx1030.ll -o main_gfx1030.o |
||||
$ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=<arch> main_<arch>.ll -o main_<arch>.o |
||||
... |
||||
``` |
||||
|
||||
3. The device object files are combined into an offload bundle using `clang-offload-bundler`. This requires specifying the target as well as the offload kind for each device, in the form `<offload-kind>-<target>-<arch>`. For HIP device code, `<offload-kind>` is `hipv4`. Note that this command requires an (empty) entry for the host to also be present, with `<offload-kind>` `host`. The order of targets and inputs must match. `<target>` is an LLVM target triple, which is specified as `<isa>-<vendor>-<os>-<abi>`. `<abi>` is left empty for AMD targets. |
||||
|
||||
```shell |
||||
$ROCM_INSTALL_DIR/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 \ |
||||
-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx1030,hipv4-... \ |
||||
-input=/dev/null \ |
||||
-input=main_gfx1030.o -input=... \ |
||||
-output=offload_bundle.hipfb |
||||
``` |
||||
|
||||
Note: using -bundle-align=4096 only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same `--offload-arch` as when compiling the source to LLVM bitcode. |
||||
|
||||
4. The offload bundle is embedded inside an object file that can be linked with the object file containing the host code. The offload bundle must be placed in the `.hip_fatbin` section, and must be placed after the symbol `__hip_fatbin`. This can be done by creating an assembly file that places the offload bundle in the appropriate section using the `.incbin` directive: |
||||
```nasm |
||||
.type __hip_fatbin,@object |
||||
; Tell the assembler to place the offload bundle in the appropriate section. |
||||
.section .hip_fatbin,"a",@progbits |
||||
; Make the symbol that addresses the binary public |
||||
.globl __hip_fatbin |
||||
; Give the bundle the required alignment |
||||
.p2align 12 |
||||
__hip_fatbin: |
||||
; Include the binary |
||||
.incbin "offload_bundle.hipfb" |
||||
``` |
||||
This file can then be assembled using `llvm-mc` as follows: |
||||
```shell |
||||
$ROCM_INSTALL_DIR/llvm/bin/llvm-mc -triple <host target> -o main_device.o hip_obj_gen.mcin --filetype=obj |
||||
``` |
||||
|
||||
5. Finally, using the system linker, `hipcc`, or `clang`, the host object and device objects are linked into an executable: |
||||
```shell |
||||
$ROCM_INSTALL_DIR/bin/hipcc -o hip_llvm_ir_to_executable main.o main_device.o |
||||
``` |
||||
|
||||
### Visual Studio 2019 |
||||
The above compilation steps are implemented in Visual Studio through Custom Build Steps and Custom Build Tools: |
||||
- The host compilation from step 1 is performed by adding extra options to the source file, under `main.hip -> properties -> C/C++ -> Command Line`: |
||||
``` |
||||
Additional Options: --cuda-host-only |
||||
``` |
||||
- Each device LLVM IR .ll file has a custom build tool associated to it, which performs the operation associated to step 2 from the previous section: |
||||
``` |
||||
Command Line: "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a |
||||
Description: Compiling Device Assembly %(Identity) |
||||
Output: $(IntDir)%(FileName).o |
||||
Execute Before: ClCompile |
||||
``` |
||||
- Steps 3 and 4 are implemented using a custom build step: |
||||
``` |
||||
Command Line: |
||||
"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" |
||||
cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj |
||||
Description: Generating Device Offload Object |
||||
Outputs: $(IntDir)main_device.obj |
||||
Additional Dependencies: $(IntDir)main_gfx90a.o;$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_objgen_win.mcin;%(Inputs) |
||||
Execute Before: ClCompile |
||||
``` |
||||
- Finally step 5 is implemented by passing additional inputs to the linker in `project -> properties -> Linker -> Input`: |
||||
``` |
||||
Additional Dependencies: $(IntDir)main_device.obj;%(AdditionalDependencies) |
||||
``` |
||||
|
||||
## Used API surface |
||||
### HIP runtime |
||||
- `hipFree` |
||||
- `hipGetDeviceProperties` |
||||
- `hipGetLastError` |
||||
- `hipLaunchKernelGGL` |
||||
- `hipMalloc` |
||||
- `hipMemcpy` |
@ -0,0 +1,21 @@
@@ -0,0 +1,21 @@
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||||
# See https://llvm.org/LICENSE.txt for license information. |
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||||
|
||||
# HIP Object Generator |
||||
# Use this generator to create a host bundled object file |
||||
# with the input of an offload bundled fat binary. |
||||
# |
||||
# Input: Offload bundle (.hipfb file) |
||||
# Output: Host Bundled Object File .o |
||||
|
||||
.type __hip_fatbin,@object |
||||
# Tell the assembler to place the offload bundle in the appropriate section. |
||||
.section .hip_fatbin,"a",@progbits |
||||
# Make the symbol that addresses the binary public. |
||||
.globl __hip_fatbin |
||||
# Give the bundle the required alignment of 4096 (2 ^ 12). |
||||
.p2align 12 |
||||
__hip_fatbin: |
||||
# Include the offload bundle. |
||||
.incbin "offload_bundle.hipfb" |
@ -0,0 +1,20 @@
@@ -0,0 +1,20 @@
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||||
# See https://llvm.org/LICENSE.txt for license information. |
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||||
|
||||
# HIP Object Generator |
||||
# Use this generator to create a host bundled object file |
||||
# with the input of an offload bundled fat binary. |
||||
# |
||||
# Input: Bundled Object file .hipfb file |
||||
# Output: Host Bundled Object File .o |
||||
|
||||
# Tell the assembler to place the offload bundle in the appropriate section. |
||||
.section .hip_fatbin,"dw" |
||||
# Make the symbol that addresses the binary public. |
||||
.globl __hip_fatbin |
||||
# Give the bundle the required alignment of 4096 (2 ^ 12). |
||||
.p2align 12 |
||||
__hip_fatbin: |
||||
# Include the offload bundle. |
||||
.incbin "offload_bundle.hipfb" |
@ -0,0 +1,183 @@
@@ -0,0 +1,183 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip"> |
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions> |
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions> |
||||
</ClCompile> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<CustomBuild Include="hip_obj_gen_win.mcin"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command> |
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message> |
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command> |
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message> |
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx1030.ll"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx803.ll"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx803</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx900.ll">
  <FileType>Document</FileType>
  <!-- Build the gfx900 device code from its LLVM IR. The compiler path is quoted as one
       token ("...clang++") so the command matches the other per-architecture items. -->
  <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
  <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
</CustomBuild>
||||
<CustomBuild Include="main_gfx906.ll"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx908.ll"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command> |
||||
</CustomBuild> |
||||
<CustomBuild Include="main_gfx90a.ll"> |
||||
<FileType>Document</FileType> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command> |
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command> |
||||
</CustomBuild> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{dbb8dfe9-cb1b-473c-937c-2a8120e0d819}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>llvm_ir_to_executable_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
<CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
<CustomBuild> |
||||
<Message>Compiling Device LLVM IR %(Identity)</Message> |
||||
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command> |
||||
<Outputs>$(IntDir)%(FileName).o</Outputs> |
||||
</CustomBuild> |
||||
<CustomBuildStep> |
||||
<Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb" |
||||
cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command> |
||||
</CustomBuildStep> |
||||
<CustomBuildStep> |
||||
<Message>Generating Device Offload Object</Message> |
||||
</CustomBuildStep> |
||||
<CustomBuildStep>
  <!-- Object file produced by the device offload step. -->
  <Outputs>$(IntDir)main_device.obj</Outputs>
</CustomBuildStep>
<CustomBuildStep>
  <!-- Files the offload step depends on: the per-architecture device objects and the
       copied llvm-mc input. The name must match the copied file, hip_obj_gen_win.mcin. -->
  <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
<CustomBuild> |
||||
<Message>Compiling Device LLVM IR %(Identity)</Message> |
||||
<Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command> |
||||
<Outputs>$(IntDir)%(FileName).o</Outputs> |
||||
</CustomBuild> |
||||
<CustomBuildStep>
  <!-- Step 1: bundle the per-architecture device objects into offload_bundle.hipfb.
       Every device target is written as hipv4-amdgcn-amd-amdhsa--<arch> (two dashes before
       the architecture), and the -targets list is in the same order as the -input list.
       Step 2: assemble hip_obj_gen_win.mcin, which embeds the bundle, into main_device.obj. -->
  <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
</CustomBuildStep>
||||
<CustomBuildStep> |
||||
<Message>Generating Device Offload Object</Message> |
||||
</CustomBuildStep> |
||||
<CustomBuildStep>
  <!-- Object file produced by the device offload step. -->
  <Outputs>$(IntDir)main_device.obj</Outputs>
</CustomBuildStep>
<CustomBuildStep>
  <!-- Files the offload step depends on: the per-architecture device objects and the
       copied llvm-mc input. The name must match the copied file, hip_obj_gen_win.mcin. -->
  <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
</CustomBuildStep>
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
@ -0,0 +1,118 @@
@@ -0,0 +1,118 @@
|
||||
// MIT License |
||||
// |
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
// |
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
// of this software and associated documentation files (the "Software"), to deal |
||||
// in the Software without restriction, including without limitation the rights |
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
// copies of the Software, and to permit persons to whom the Software is |
||||
// furnished to do so, subject to the following conditions: |
||||
// |
||||
// The above copyright notice and this permission notice shall be included in all |
||||
// copies or substantial portions of the Software. |
||||
// |
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
// SOFTWARE. |
||||
|
||||
#include "example_utils.hpp" |
||||
|
||||
#include <hip/hip_runtime.h> |
||||
|
||||
#include <cstdlib> |
||||
#include <iostream> |
||||
#include <vector> |
||||
|
||||
/// \brief Device function to square each element |
||||
/// in the array `in` and write to array `out`. |
||||
template<typename T> |
||||
__global__ void vector_square_kernel(T* out, const T* in, const long long size) |
||||
{ |
||||
// Get the unique global thread ID |
||||
const size_t offset = blockIdx.x * blockDim.x + threadIdx.x; |
||||
// Each thread hops stride amount of elements to find the next |
||||
// element to square |
||||
const size_t stride = blockDim.x * gridDim.x; |
||||
|
||||
for(size_t i = offset; i < size; i += stride) |
||||
{ |
||||
out[i] = in[i] * in[i]; |
||||
} |
||||
} |
||||
|
||||
int main() |
||||
{ |
||||
// Set the problem size |
||||
constexpr size_t size = 1000000; |
||||
constexpr size_t size_in_bytes = size * sizeof(float); |
||||
|
||||
hipDeviceProp_t props; |
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0 /*deviceID*/)); |
||||
std::cout << "info: running on device " << props.name << "\n"; |
||||
|
||||
std::cout << "info: allocate host mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB) " |
||||
<< "\n"; |
||||
|
||||
// Declare the host side arrays |
||||
std::vector<float> h_in(size); |
||||
std::vector<float> h_out(size); |
||||
|
||||
// Initialize the host size input |
||||
for(size_t i = 0; i < size; i++) |
||||
{ |
||||
h_in[i] = 1.618f + i; |
||||
} |
||||
|
||||
// Declare the device side arrays |
||||
float *d_in, *d_out; |
||||
std::cout << "info: allocate device mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB)\n"; |
||||
// Allocate the device side memory |
||||
HIP_CHECK(hipMalloc(&d_in, size_in_bytes)); |
||||
HIP_CHECK(hipMalloc(&d_out, size_in_bytes)); |
||||
|
||||
std::cout << "info: copy Host2Device\n"; |
||||
|
||||
// Copy the input from host to the GPU device |
||||
HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Set the number of blocks per kernel grid. |
||||
constexpr unsigned int grid_size = 512; |
||||
// Set the number of threads per kernel block. |
||||
constexpr unsigned int threads_per_block = 256; |
||||
|
||||
std::cout << "info: launch 'vector_square_kernel' kernel\n"; |
||||
hipLaunchKernelGGL(vector_square_kernel, |
||||
grid_size, |
||||
threads_per_block, |
||||
0, |
||||
hipStreamDefault, |
||||
d_out, |
||||
d_in, |
||||
size); |
||||
|
||||
// Check that the kernel invocation was successful. |
||||
HIP_CHECK(hipGetLastError()); |
||||
|
||||
std::cout << "info: copy Device2Host\n"; |
||||
HIP_CHECK(hipMemcpy(h_out.data(), d_out, size_in_bytes, hipMemcpyDeviceToHost)); |
||||
|
||||
HIP_CHECK(hipFree(d_in)); |
||||
HIP_CHECK(hipFree(d_out)); |
||||
|
||||
std::cout << "info: check result\n"; |
||||
for(size_t i = 0; i < size; i++) |
||||
{ |
||||
if(h_out[i] != h_in[i] * h_in[i]) |
||||
{ |
||||
std::cerr << "FAILED! h_out[" << i << "] = " << h_out[i] |
||||
<< ", expected: " << h_in[i] * h_in[i] << '\n'; |
||||
exit(error_exit_code); |
||||
} |
||||
} |
||||
std::cout << "PASSED!\n"; |
||||
} |
@ -0,0 +1,97 @@
@@ -0,0 +1,97 @@
|
||||
; ModuleID = 'main_gfx1030.bc' |
||||
source_filename = "./main.hip" |
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" |
||||
target triple = "amdgcn-amd-amdhsa" |
||||
|
||||
%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any |
||||
|
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 |
||||
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" |
||||
|
||||
; Device code for the float instantiation of vector_square_kernel (target-cpu gfx1030).
;   %0 = destination buffer, %1 = source buffer (read only), %2 = i64 element count.
; Entry block: loads an i32 at byte offset 12 (%8) and an i16 at byte offset 4 (%11)
; of the dispatch packet, forms the 32-bit start index
; workgroup.id.x * %11 + workitem.id.x (%15) and zero-extends it to i64 (%16).
; %8 zero-extended to i64 (%17) is the amount the index advances per iteration.
; Block 20 is the loop: it stores %1[i] * %1[i] to %0[i], adds %17 to i and repeats
; while i is unsigned-less-than %2. Block 19 returns.
; Function Attrs: mustprogress nofree norecurse nosync nounwind |
||||
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { |
||||
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 |
||||
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 |
||||
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 |
||||
%7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* |
||||
%8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 |
||||
%9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 |
||||
%10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* |
||||
%11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 |
||||
%12 = zext i16 %11 to i32 |
||||
%13 = mul i32 %4, %12 |
||||
%14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 |
||||
%15 = add i32 %13, %14 |
||||
%16 = zext i32 %15 to i64 |
||||
%17 = zext i32 %8 to i64 |
||||
%18 = icmp ult i64 %16, %2 |
||||
br i1 %18, label %20, label %19 |
||||
|
||||
19: ; preds = %20, %3 |
||||
ret void |
||||
|
||||
20: ; preds = %3, %20 |
||||
%21 = phi i64 [ %26, %20 ], [ %16, %3 ] |
||||
%22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 |
||||
%23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 |
||||
%24 = fmul contract float %23, %23 |
||||
%25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 |
||||
store float %24, float addrspace(1)* %25, align 4, !tbaa !16 |
||||
%26 = add i64 %21, %17 |
||||
%27 = icmp ult i64 %26, %2 |
||||
br i1 %27, label %20, label %19, !llvm.loop !20 |
||||
} |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1 |
||||
|
||||
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } |
||||
attributes #1 = { nounwind readnone speculatable willreturn } |
||||
attributes #2 = { nounwind } |
||||
|
||||
!llvm.module.flags = !{!0, !1} |
||||
!opencl.ocl.version = !{!2} |
||||
!llvm.ident = !{!3} |
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4} |
||||
!1 = !{i32 7, !"PIC Level", i32 1} |
||||
!2 = !{i32 2, i32 0} |
||||
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} |
||||
!4 = !{!5, !9, i64 12} |
||||
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} |
||||
!6 = !{!"short", !7, i64 0} |
||||
!7 = !{!"omnipotent char", !8, i64 0} |
||||
!8 = !{!"Simple C/C++ TBAA"} |
||||
!9 = !{!"int", !7, i64 0} |
||||
!10 = !{!"long", !7, i64 0} |
||||
!11 = !{!"any pointer", !7, i64 0} |
||||
!12 = !{!"hsa_signal_s", !10, i64 0} |
||||
!13 = !{i16 1, i16 1025} |
||||
!14 = !{} |
||||
!15 = !{i32 0, i32 1024} |
||||
!16 = !{!17, !17, i64 0} |
||||
!17 = !{!"float", !18, i64 0} |
||||
!18 = !{!"omnipotent char", !19, i64 0} |
||||
!19 = !{!"Simple C++ TBAA"} |
||||
!20 = distinct !{!20, !21} |
||||
!21 = !{!"llvm.loop.mustprogress"} |
@ -0,0 +1,97 @@
@@ -0,0 +1,97 @@
|
||||
; ModuleID = 'main_gfx803.bc' |
||||
source_filename = "./main.hip" |
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" |
||||
target triple = "amdgcn-amd-amdhsa" |
||||
|
||||
%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any |
||||
|
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 |
||||
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" |
||||
|
||||
; Device code for the float instantiation of vector_square_kernel (target-cpu gfx803).
;   %0 = destination buffer, %1 = source buffer (read only), %2 = i64 element count.
; Entry block: loads an i32 at byte offset 12 (%8) and an i16 at byte offset 4 (%11)
; of the dispatch packet, forms the 32-bit start index
; workgroup.id.x * %11 + workitem.id.x (%15) and zero-extends it to i64 (%16).
; %8 zero-extended to i64 (%17) is the amount the index advances per iteration.
; Block 20 is the loop: it stores %1[i] * %1[i] to %0[i], adds %17 to i and repeats
; while i is unsigned-less-than %2. Block 19 returns.
; Function Attrs: mustprogress nofree norecurse nosync nounwind |
||||
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { |
||||
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 |
||||
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 |
||||
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 |
||||
%7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* |
||||
%8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 |
||||
%9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 |
||||
%10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* |
||||
%11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 |
||||
%12 = zext i16 %11 to i32 |
||||
%13 = mul i32 %4, %12 |
||||
%14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 |
||||
%15 = add i32 %13, %14 |
||||
%16 = zext i32 %15 to i64 |
||||
%17 = zext i32 %8 to i64 |
||||
%18 = icmp ult i64 %16, %2 |
||||
br i1 %18, label %20, label %19 |
||||
|
||||
19: ; preds = %20, %3 |
||||
ret void |
||||
|
||||
20: ; preds = %3, %20 |
||||
%21 = phi i64 [ %26, %20 ], [ %16, %3 ] |
||||
%22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 |
||||
%23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 |
||||
%24 = fmul contract float %23, %23 |
||||
%25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 |
||||
store float %24, float addrspace(1)* %25, align 4, !tbaa !16 |
||||
%26 = add i64 %21, %17 |
||||
%27 = icmp ult i64 %26, %2 |
||||
br i1 %27, label %20, label %19, !llvm.loop !20 |
||||
} |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1 |
||||
|
||||
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx803" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } |
||||
attributes #1 = { nounwind readnone speculatable willreturn } |
||||
attributes #2 = { nounwind } |
||||
|
||||
!llvm.module.flags = !{!0, !1} |
||||
!opencl.ocl.version = !{!2} |
||||
!llvm.ident = !{!3} |
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4} |
||||
!1 = !{i32 7, !"PIC Level", i32 1} |
||||
!2 = !{i32 2, i32 0} |
||||
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} |
||||
!4 = !{!5, !9, i64 12} |
||||
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} |
||||
!6 = !{!"short", !7, i64 0} |
||||
!7 = !{!"omnipotent char", !8, i64 0} |
||||
!8 = !{!"Simple C/C++ TBAA"} |
||||
!9 = !{!"int", !7, i64 0} |
||||
!10 = !{!"long", !7, i64 0} |
||||
!11 = !{!"any pointer", !7, i64 0} |
||||
!12 = !{!"hsa_signal_s", !10, i64 0} |
||||
!13 = !{i16 1, i16 1025} |
||||
!14 = !{} |
||||
!15 = !{i32 0, i32 1024} |
||||
!16 = !{!17, !17, i64 0} |
||||
!17 = !{!"float", !18, i64 0} |
||||
!18 = !{!"omnipotent char", !19, i64 0} |
||||
!19 = !{!"Simple C++ TBAA"} |
||||
!20 = distinct !{!20, !21} |
||||
!21 = !{!"llvm.loop.mustprogress"} |
@ -0,0 +1,97 @@
@@ -0,0 +1,97 @@
|
||||
; ModuleID = 'main_gfx900.bc' |
||||
source_filename = "./main.hip" |
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" |
||||
target triple = "amdgcn-amd-amdhsa" |
||||
|
||||
%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any |
||||
|
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 |
||||
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" |
||||
|
||||
; Kernel entry point (gfx900 build). The mangled name demangles (Itanium ABI) to
;   void vector_square_kernel<float>(float*, const float*, long long)
; For every index i this work-item visits, stores %1[i] * %1[i] into %0[i].
;   %0 - output float buffer in addrspace(1); the body only stores to it.
;   %1 - input float buffer in addrspace(1); marked readonly, the body only loads from it.
;   %2 - 64-bit element bound; indices are tested against it with unsigned `icmp ult`.
; First index = workgroup.id.x * (i16 at dispatch-packet byte 4) + workitem.id.x.
; Index step  = i32 at dispatch-packet byte 12, zero-extended to i64.
; NOTE(review): bytes 4 and 12 of the HSA kernel dispatch packet are presumably
; workgroup_size_x and grid_size_x -- confirm against the HSA packet layout.
; Function Attrs: mustprogress nofree norecurse nosync nounwind |
||||
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { |
||||
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 |
||||
; Load the i32 stored 12 bytes into the dispatch packet; %17 later widens it into the loop step.
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 |
||||
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 |
||||
%7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* |
||||
%8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 |
||||
; Load the i16 stored 4 bytes into the dispatch packet; !range !13 bounds it to [1, 1025).
%9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 |
||||
%10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* |
||||
%11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 |
||||
%12 = zext i16 %11 to i32 |
||||
; %15 = workgroup id * %12 + work-item id (!range !15: [0, 1024)), computed in 32 bits.
%13 = mul i32 %4, %12 |
||||
%14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 |
||||
%15 = add i32 %13, %14 |
||||
; Widen the start index and the step to i64, then skip the loop when the start index is not below %2.
%16 = zext i32 %15 to i64 |
||||
%17 = zext i32 %8 to i64 |
||||
%18 = icmp ult i64 %16, %2 |
||||
br i1 %18, label %20, label %19 |
||||
|
||||
; Exit block: reached from the entry check or when the loop condition fails.
19: ; preds = %20, %3 |
||||
ret void |
||||
|
||||
; Loop block: square one element, then advance %21 by the step %17 until it is no longer below %2.
20: ; preds = %3, %20 |
||||
%21 = phi i64 [ %26, %20 ], [ %16, %3 ] |
||||
%22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 |
||||
%23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 |
||||
%24 = fmul contract float %23, %23 |
||||
%25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 |
||||
store float %24, float addrspace(1)* %25, align 4, !tbaa !16 |
||||
%26 = add i64 %21, %17 |
||||
%27 = icmp ult i64 %26, %2 |
||||
br i1 %27, label %20, label %19, !llvm.loop !20 |
||||
} |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1 |
||||
|
||||
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } |
||||
attributes #1 = { nounwind readnone speculatable willreturn } |
||||
attributes #2 = { nounwind } |
||||
|
||||
!llvm.module.flags = !{!0, !1} |
||||
!opencl.ocl.version = !{!2} |
||||
!llvm.ident = !{!3} |
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4} |
||||
!1 = !{i32 7, !"PIC Level", i32 1} |
||||
!2 = !{i32 2, i32 0} |
||||
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} |
||||
!4 = !{!5, !9, i64 12} |
||||
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} |
||||
!6 = !{!"short", !7, i64 0} |
||||
!7 = !{!"omnipotent char", !8, i64 0} |
||||
!8 = !{!"Simple C/C++ TBAA"} |
||||
!9 = !{!"int", !7, i64 0} |
||||
!10 = !{!"long", !7, i64 0} |
||||
!11 = !{!"any pointer", !7, i64 0} |
||||
!12 = !{!"hsa_signal_s", !10, i64 0} |
||||
!13 = !{i16 1, i16 1025} |
||||
!14 = !{} |
||||
!15 = !{i32 0, i32 1024} |
||||
!16 = !{!17, !17, i64 0} |
||||
!17 = !{!"float", !18, i64 0} |
||||
!18 = !{!"omnipotent char", !19, i64 0} |
||||
!19 = !{!"Simple C++ TBAA"} |
||||
!20 = distinct !{!20, !21} |
||||
!21 = !{!"llvm.loop.mustprogress"} |
@ -0,0 +1,97 @@
@@ -0,0 +1,97 @@
|
||||
; ModuleID = 'main_gfx906.bc' |
||||
source_filename = "./main.hip" |
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" |
||||
target triple = "amdgcn-amd-amdhsa" |
||||
|
||||
%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any |
||||
|
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 |
||||
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" |
||||
|
||||
; Kernel entry point (gfx906 build). The mangled name demangles (Itanium ABI) to
;   void vector_square_kernel<float>(float*, const float*, long long)
; For every index i this work-item visits, stores %1[i] * %1[i] into %0[i].
;   %0 - output float buffer in addrspace(1); the body only stores to it.
;   %1 - input float buffer in addrspace(1); marked readonly, the body only loads from it.
;   %2 - 64-bit element bound; indices are tested against it with unsigned `icmp ult`.
; First index = workgroup.id.x * (i16 at dispatch-packet byte 4) + workitem.id.x.
; Index step  = i32 at dispatch-packet byte 12, zero-extended to i64.
; NOTE(review): bytes 4 and 12 of the HSA kernel dispatch packet are presumably
; workgroup_size_x and grid_size_x -- confirm against the HSA packet layout.
; Function Attrs: mustprogress nofree norecurse nosync nounwind |
||||
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { |
||||
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 |
||||
; Load the i32 stored 12 bytes into the dispatch packet; %17 later widens it into the loop step.
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 |
||||
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 |
||||
%7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* |
||||
%8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 |
||||
; Load the i16 stored 4 bytes into the dispatch packet; !range !13 bounds it to [1, 1025).
%9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 |
||||
%10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* |
||||
%11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 |
||||
%12 = zext i16 %11 to i32 |
||||
; %15 = workgroup id * %12 + work-item id (!range !15: [0, 1024)), computed in 32 bits.
%13 = mul i32 %4, %12 |
||||
%14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 |
||||
%15 = add i32 %13, %14 |
||||
; Widen the start index and the step to i64, then skip the loop when the start index is not below %2.
%16 = zext i32 %15 to i64 |
||||
%17 = zext i32 %8 to i64 |
||||
%18 = icmp ult i64 %16, %2 |
||||
br i1 %18, label %20, label %19 |
||||
|
||||
; Exit block: reached from the entry check or when the loop condition fails.
19: ; preds = %20, %3 |
||||
ret void |
||||
|
||||
; Loop block: square one element, then advance %21 by the step %17 until it is no longer below %2.
20: ; preds = %3, %20 |
||||
%21 = phi i64 [ %26, %20 ], [ %16, %3 ] |
||||
%22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 |
||||
%23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 |
||||
%24 = fmul contract float %23, %23 |
||||
%25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 |
||||
store float %24, float addrspace(1)* %25, align 4, !tbaa !16 |
||||
%26 = add i64 %21, %17 |
||||
%27 = icmp ult i64 %26, %2 |
||||
br i1 %27, label %20, label %19, !llvm.loop !20 |
||||
} |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1 |
||||
|
||||
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } |
||||
attributes #1 = { nounwind readnone speculatable willreturn } |
||||
attributes #2 = { nounwind } |
||||
|
||||
!llvm.module.flags = !{!0, !1} |
||||
!opencl.ocl.version = !{!2} |
||||
!llvm.ident = !{!3} |
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4} |
||||
!1 = !{i32 7, !"PIC Level", i32 1} |
||||
!2 = !{i32 2, i32 0} |
||||
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} |
||||
!4 = !{!5, !9, i64 12} |
||||
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} |
||||
!6 = !{!"short", !7, i64 0} |
||||
!7 = !{!"omnipotent char", !8, i64 0} |
||||
!8 = !{!"Simple C/C++ TBAA"} |
||||
!9 = !{!"int", !7, i64 0} |
||||
!10 = !{!"long", !7, i64 0} |
||||
!11 = !{!"any pointer", !7, i64 0} |
||||
!12 = !{!"hsa_signal_s", !10, i64 0} |
||||
!13 = !{i16 1, i16 1025} |
||||
!14 = !{} |
||||
!15 = !{i32 0, i32 1024} |
||||
!16 = !{!17, !17, i64 0} |
||||
!17 = !{!"float", !18, i64 0} |
||||
!18 = !{!"omnipotent char", !19, i64 0} |
||||
!19 = !{!"Simple C++ TBAA"} |
||||
!20 = distinct !{!20, !21} |
||||
!21 = !{!"llvm.loop.mustprogress"} |
@ -0,0 +1,97 @@
@@ -0,0 +1,97 @@
|
||||
; ModuleID = 'main_gfx908.bc' |
||||
source_filename = "./main.hip" |
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" |
||||
target triple = "amdgcn-amd-amdhsa" |
||||
|
||||
%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any |
||||
|
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 |
||||
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" |
||||
|
||||
; Kernel entry point (gfx908 build). The mangled name demangles (Itanium ABI) to
;   void vector_square_kernel<float>(float*, const float*, long long)
; For every index i this work-item visits, stores %1[i] * %1[i] into %0[i].
;   %0 - output float buffer in addrspace(1); the body only stores to it.
;   %1 - input float buffer in addrspace(1); marked readonly, the body only loads from it.
;   %2 - 64-bit element bound; indices are tested against it with unsigned `icmp ult`.
; First index = workgroup.id.x * (i16 at dispatch-packet byte 4) + workitem.id.x.
; Index step  = i32 at dispatch-packet byte 12, zero-extended to i64.
; NOTE(review): bytes 4 and 12 of the HSA kernel dispatch packet are presumably
; workgroup_size_x and grid_size_x -- confirm against the HSA packet layout.
; Function Attrs: mustprogress nofree norecurse nosync nounwind |
||||
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { |
||||
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 |
||||
; Load the i32 stored 12 bytes into the dispatch packet; %17 later widens it into the loop step.
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 |
||||
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 |
||||
%7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* |
||||
%8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 |
||||
; Load the i16 stored 4 bytes into the dispatch packet; !range !13 bounds it to [1, 1025).
%9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 |
||||
%10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* |
||||
%11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 |
||||
%12 = zext i16 %11 to i32 |
||||
; %15 = workgroup id * %12 + work-item id (!range !15: [0, 1024)), computed in 32 bits.
%13 = mul i32 %4, %12 |
||||
%14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 |
||||
%15 = add i32 %13, %14 |
||||
; Widen the start index and the step to i64, then skip the loop when the start index is not below %2.
%16 = zext i32 %15 to i64 |
||||
%17 = zext i32 %8 to i64 |
||||
%18 = icmp ult i64 %16, %2 |
||||
br i1 %18, label %20, label %19 |
||||
|
||||
; Exit block: reached from the entry check or when the loop condition fails.
19: ; preds = %20, %3 |
||||
ret void |
||||
|
||||
; Loop block: square one element, then advance %21 by the step %17 until it is no longer below %2.
20: ; preds = %3, %20 |
||||
%21 = phi i64 [ %26, %20 ], [ %16, %3 ] |
||||
%22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 |
||||
%23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 |
||||
%24 = fmul contract float %23, %23 |
||||
%25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 |
||||
store float %24, float addrspace(1)* %25, align 4, !tbaa !16 |
||||
%26 = add i64 %21, %17 |
||||
%27 = icmp ult i64 %26, %2 |
||||
br i1 %27, label %20, label %19, !llvm.loop !20 |
||||
} |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1 |
||||
|
||||
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } |
||||
attributes #1 = { nounwind readnone speculatable willreturn } |
||||
attributes #2 = { nounwind } |
||||
|
||||
!llvm.module.flags = !{!0, !1} |
||||
!opencl.ocl.version = !{!2} |
||||
!llvm.ident = !{!3} |
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4} |
||||
!1 = !{i32 7, !"PIC Level", i32 1} |
||||
!2 = !{i32 2, i32 0} |
||||
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} |
||||
!4 = !{!5, !9, i64 12} |
||||
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} |
||||
!6 = !{!"short", !7, i64 0} |
||||
!7 = !{!"omnipotent char", !8, i64 0} |
||||
!8 = !{!"Simple C/C++ TBAA"} |
||||
!9 = !{!"int", !7, i64 0} |
||||
!10 = !{!"long", !7, i64 0} |
||||
!11 = !{!"any pointer", !7, i64 0} |
||||
!12 = !{!"hsa_signal_s", !10, i64 0} |
||||
!13 = !{i16 1, i16 1025} |
||||
!14 = !{} |
||||
!15 = !{i32 0, i32 1024} |
||||
!16 = !{!17, !17, i64 0} |
||||
!17 = !{!"float", !18, i64 0} |
||||
!18 = !{!"omnipotent char", !19, i64 0} |
||||
!19 = !{!"Simple C++ TBAA"} |
||||
!20 = distinct !{!20, !21} |
||||
!21 = !{!"llvm.loop.mustprogress"} |
@ -0,0 +1,97 @@
@@ -0,0 +1,97 @@
|
||||
; ModuleID = 'main_gfx90a.bc' |
||||
source_filename = "./main.hip" |
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" |
||||
target triple = "amdgcn-amd-amdhsa" |
||||
|
||||
%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 } |
||||
%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 } |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any |
||||
|
||||
$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any |
||||
|
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1 |
||||
@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1 |
||||
@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata" |
||||
|
||||
; Kernel entry point (gfx90a build). The mangled name demangles (Itanium ABI) to
;   void vector_square_kernel<float>(float*, const float*, long long)
; For every index i this work-item visits, stores %1[i] * %1[i] into %0[i].
;   %0 - output float buffer in addrspace(1); the body only stores to it.
;   %1 - input float buffer in addrspace(1); marked readonly, the body only loads from it.
;   %2 - 64-bit element bound; indices are tested against it with unsigned `icmp ult`.
; First index = workgroup.id.x * (i16 at dispatch-packet byte 4) + workitem.id.x.
; Index step  = i32 at dispatch-packet byte 12, zero-extended to i64.
; NOTE(review): bytes 4 and 12 of the HSA kernel dispatch packet are presumably
; workgroup_size_x and grid_size_x -- confirm against the HSA packet layout.
; Function Attrs: mustprogress nofree norecurse nosync nounwind |
||||
define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { |
||||
%4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 |
||||
; Load the i32 stored 12 bytes into the dispatch packet; %17 later widens it into the loop step.
%5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 |
||||
%6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 |
||||
%7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)* |
||||
%8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 |
||||
; Load the i16 stored 4 bytes into the dispatch packet; !range !13 bounds it to [1, 1025).
%9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 |
||||
%10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)* |
||||
%11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 |
||||
%12 = zext i16 %11 to i32 |
||||
; %15 = workgroup id * %12 + work-item id (!range !15: [0, 1024)), computed in 32 bits.
%13 = mul i32 %4, %12 |
||||
%14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 |
||||
%15 = add i32 %13, %14 |
||||
; Widen the start index and the step to i64, then skip the loop when the start index is not below %2.
%16 = zext i32 %15 to i64 |
||||
%17 = zext i32 %8 to i64 |
||||
%18 = icmp ult i64 %16, %2 |
||||
br i1 %18, label %20, label %19 |
||||
|
||||
; Exit block: reached from the entry check or when the loop condition fails.
19: ; preds = %20, %3 |
||||
ret void |
||||
|
||||
; Loop block: square one element, then advance %21 by the step %17 until it is no longer below %2.
20: ; preds = %3, %20 |
||||
%21 = phi i64 [ %26, %20 ], [ %16, %3 ] |
||||
%22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21 |
||||
%23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 |
||||
%24 = fmul contract float %23, %23 |
||||
%25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21 |
||||
store float %24, float addrspace(1)* %25, align 4, !tbaa !16 |
||||
%26 = add i64 %21, %17 |
||||
%27 = icmp ult i64 %26, %2 |
||||
br i1 %27, label %20, label %19, !llvm.loop !20 |
||||
} |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1 |
||||
|
||||
; Function Attrs: nounwind readnone speculatable willreturn |
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1 |
||||
|
||||
attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" } |
||||
attributes #1 = { nounwind readnone speculatable willreturn } |
||||
attributes #2 = { nounwind } |
||||
|
||||
!llvm.module.flags = !{!0, !1} |
||||
!opencl.ocl.version = !{!2} |
||||
!llvm.ident = !{!3} |
||||
|
||||
!0 = !{i32 1, !"wchar_size", i32 4} |
||||
!1 = !{i32 7, !"PIC Level", i32 1} |
||||
!2 = !{i32 2, i32 0} |
||||
!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"} |
||||
!4 = !{!5, !9, i64 12} |
||||
!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56} |
||||
!6 = !{!"short", !7, i64 0} |
||||
!7 = !{!"omnipotent char", !8, i64 0} |
||||
!8 = !{!"Simple C/C++ TBAA"} |
||||
!9 = !{!"int", !7, i64 0} |
||||
!10 = !{!"long", !7, i64 0} |
||||
!11 = !{!"any pointer", !7, i64 0} |
||||
!12 = !{!"hsa_signal_s", !10, i64 0} |
||||
!13 = !{i16 1, i16 1025} |
||||
!14 = !{} |
||||
!15 = !{i32 0, i32 1024} |
||||
!16 = !{!17, !17, i64 0} |
||||
!17 = !{!"float", !18, i64 0} |
||||
!18 = !{!"omnipotent char", !19, i64 0} |
||||
!19 = !{!"Simple C++ TBAA"} |
||||
!20 = distinct !{!20, !21} |
||||
!21 = !{!"llvm.loop.mustprogress"} |
@ -1,86 +0,0 @@
@@ -1,86 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#ifndef HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP |
||||
#define HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP |
||||
|
||||
#include "example_utils.hpp" |
||||
|
||||
#include <charconv> |
||||
#include <iostream> |
||||
#include <string> |
||||
#include <string_view> |
||||
|
||||
#include <cstdlib> |
||||
|
||||
/// \brief Tries to read the matrix dimensions from the command line.
|
||||
/// If no command line arguments were provided, the passed values are not modified.
|
||||
/// Otherwise, the number of arguments must be 3: <A rows> <A columns> <B columns>
|
||||
/// (B rows will be equal to A columns).
|
||||
/// If the number of arguments is different, or the arguments cannot be parsed to
|
||||
/// unsigned ints, an error message is printed and the program exits with a non-zero code.
|
||||
inline void matrix_dimensions_from_command_line(const int argc, |
||||
const char* argv[], |
||||
unsigned int& a_rows, |
||||
unsigned int& a_cols, |
||||
unsigned int& b_cols, |
||||
const unsigned int block_size) |
||||
{ |
||||
const auto print_usage_and_exit = [=]() |
||||
{ |
||||
const std::string usage_message |
||||
= "Calculates matrix product A*B.\n" |
||||
"Usage: hip_matrix_multiplication [<A rows> <A columns> <B columns>].\n" |
||||
"Matrix dimensions must be positive multiples of block_size (" |
||||
+ std::to_string(block_size) + ")"; |
||||
std::cout << usage_message << std::endl; |
||||
exit(error_exit_code); |
||||
}; |
||||
const auto get_argument_by_index = [=](const unsigned int index) -> unsigned int |
||||
{ |
||||
const std::string_view argument_text(argv[index]); |
||||
|
||||
unsigned int converted_value; |
||||
const auto conversion_result = std::from_chars(argument_text.data(), |
||||
argument_text.data() + argument_text.size(), |
||||
converted_value); |
||||
if(conversion_result.ec != std::errc{} || (converted_value % block_size) != 0) |
||||
{ |
||||
print_usage_and_exit(); |
||||
} |
||||
return converted_value; |
||||
}; |
||||
|
||||
if(argc == 1) |
||||
{ |
||||
return; |
||||
} |
||||
if(argc != 4) |
||||
{ |
||||
print_usage_and_exit(); |
||||
} |
||||
a_rows = get_argument_by_index(1); |
||||
a_cols = get_argument_by_index(2); |
||||
b_cols = get_argument_by_index(3); |
||||
} |
||||
|
||||
#endif // HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP
|
@ -1,97 +1,101 @@
@@ -1,97 +1,101 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>matrix_multiplication_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>matrix_multiplication_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<RuntimeTypeInfo>true</RuntimeTypeInfo> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<RuntimeTypeInfo>true</RuntimeTypeInfo> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
||||
|
@ -1,95 +1,99 @@
@@ -1,95 +1,99 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{e5b2fc79-3928-47f6-b57b-33aaa3c5d9c5}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>occupancy_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{e5b2fc79-3928-47f6-b57b-33aaa3c5d9c5}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>occupancy_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
||||
|
@ -0,0 +1 @@
@@ -0,0 +1 @@
|
||||
hip_runtime_compilation |
@ -0,0 +1,70 @@
@@ -0,0 +1,70 @@
|
||||
# MIT License |
||||
# |
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
# |
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
# of this software and associated documentation files (the "Software"), to deal |
||||
# in the Software without restriction, including without limitation the rights |
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
# copies of the Software, and to permit persons to whom the Software is |
||||
# furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included in all |
||||
# copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
# SOFTWARE. |
||||
|
||||
set(example_name hip_runtime_compilation) |
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) |
||||
project(${example_name} LANGUAGES CXX) |
||||
|
||||
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") |
||||
set(GPU_RUNTIMES "HIP" "CUDA") |
||||
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) |
||||
|
||||
if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) |
||||
set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") |
||||
message(FATAL_ERROR ${ERROR_MESSAGE}) |
||||
endif() |
||||
|
||||
enable_language(${GPU_RUNTIME}) |
||||
set(CMAKE_${GPU_RUNTIME}_STANDARD 17) |
||||
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) |
||||
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) |
||||
|
||||
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") |
||||
if(NOT CMAKE_PREFIX_PATH) |
||||
set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") |
||||
endif() |
||||
|
||||
add_executable(${example_name} main.hip) |
||||
# Make example runnable using ctest |
||||
add_test(${example_name} ${example_name}) |
||||
|
||||
set(link_libs "") |
||||
set(include_dirs "../../Common") |
||||
|
||||
if(GPU_RUNTIME STREQUAL "HIP") |
||||
# Link hiprtc library |
||||
find_library(HIPRTC_LIB hiprtc REQUIRED) |
||||
list(APPEND link_libs "${HIPRTC_LIB}") |
||||
endif() |
||||
|
||||
if(GPU_RUNTIME STREQUAL "CUDA") |
||||
# Include the HIP header directory. |
||||
list(APPEND include_dirs "${ROCM_ROOT}/include") |
||||
# In this example we also need to link nvrtc CUDA library |
||||
find_package("CUDAToolkit" REQUIRED) |
||||
list(APPEND link_libs "CUDA::nvrtc") |
||||
endif() |
||||
|
||||
target_link_libraries(${example_name} ${link_libs}) |
||||
target_include_directories(${example_name} PRIVATE ${include_dirs}) |
||||
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) |
@ -0,0 +1,55 @@
@@ -0,0 +1,55 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
EXAMPLE := hip_runtime_compilation |
||||
COMMON_INCLUDE_DIR := ../../Common |
||||
GPU_RUNTIME := HIP |
||||
|
||||
# HIP variables
|
||||
ROCM_INSTALL_DIR := /opt/rocm |
||||
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include |
||||
|
||||
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc |
||||
|
||||
# Common variables and flags
|
||||
CXX_STD := c++17 |
||||
CXXFLAGS := -std=$(CXX_STD) |
||||
CPPFLAGS := -I $(COMMON_INCLUDE_DIR) |
||||
LDFLAGS := |
||||
LDLIBS := |
||||
|
||||
ifeq ($(GPU_RUNTIME), CUDA) |
||||
CXXFLAGS += -x cu |
||||
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) |
||||
LDLIBS += -l nvrtc |
||||
else ifeq ($(GPU_RUNTIME), HIP) |
||||
else |
||||
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) |
||||
endif |
||||
|
||||
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp |
||||
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ |
||||
|
||||
clean: |
||||
$(RM) $(EXAMPLE) |
||||
|
||||
.PHONY: clean |
@ -0,0 +1,91 @@
@@ -0,0 +1,91 @@
|
||||
# HIP-Basic Runtime Compilation Example |
||||
|
||||
## Description |
||||
|
||||
Runtime compilation allows compiling fragments of source code to machine code at runtime, when a program is already running, rather than compiling the code ahead of time. HIP supports runtime compilation through hipRTC, which can be used to compile HIP device code at runtime. This permits specific optimizations that depend on values determined at runtime. Therefore, usage of hipRTC provides the possibility of obtaining optimizations and performance improvements over offline compilation. |
||||
|
||||
This example showcases how to use hipRTC to compile a kernel at runtime and launch it on a device. This kernel is a simple SAXPY, i.e. the single-precision operation $y_i=ax_i+y_i$. |
||||
|
||||
### Application flow |
||||
The diagram below summarizes the runtime compilation part of the example. |
||||
1. A number of variables are declared and defined to configure the program which will be compiled in runtime. |
||||
2. The program is created using the above variables as parameters, along with the SAXPY kernel in string form. |
||||
3. The properties of the first device (GPU) available are consulted to set the device architecture as (the only) compile option. |
||||
4. The program is compiled using the previously mentioned compile options. |
||||
5. If the compile process generated a log, it is printed to the standard output. |
||||
6. The binary compiled from the program is stored as a vector of characters and the program object is destroyed. |
||||
7. Begin the preparation for the launch of the kernel on the device. A number of constants are defined to control the problem details and the kernel launch parameters. |
||||
8. The two input vectors, $x$ and $y$, are instantiated in host memory and filled with the increasing sequences $1, 2, 3, 4, ...$ and $2, 4, 6, 8, ...$, respectively. |
||||
9. The necessary amount of device (GPU) memory is allocated and the elements of the input vectors are copied to the device memory. |
||||
10. A HIP module corresponding to the compiled binary is loaded into the current context and the SAXPY kernel is extracted from it into a HIP function object. |
||||
11. The kernel launch configuration options and its arguments are declared and defined. |
||||
12. A trace message is printed to the standard output. |
||||
13. The GPU kernel is then launched with the above mentioned options along with the constants defined previously. |
||||
14. The results are copied back to host vector $y$. |
||||
15. The previously allocated device memory is freed. |
||||
16. The module is unloaded from the current context and freed. |
||||
17. The first few elements of the result vector $y$ are printed to the standard output. |
||||
|
||||
 |
||||
## Key APIs and Concepts |
||||
- `hipGetDeviceProperties` extracts the properties of the desired device. In this example it is used to get the GPU architecture. |
||||
- `hipModuleGetFunction` extracts a handle for a function with a certain name from a given module. Note that if no function with that name is present in the module this method will return an error. |
||||
- `hipModuleLaunchKernel` queues the launch of the provided kernel on the device. This function normally presents an asynchronous behaviour (see `HIP_LAUNCH_BLOCKING`), i.e. a call to it may return before the device finishes the execution of the kernel. Its parameters are the following: |
||||
- The kernel to be launched. |
||||
- Number of blocks in the dimension X of kernel grid, i.e. the X component of grid size. |
||||
- Number of blocks in the dimension Y of kernel grid, i.e. the Y component of grid size. |
||||
- Number of blocks in the dimension Z of kernel grid, i.e. the Z component of grid size. |
||||
- Number of threads in the dimension X of each block, i.e. the X component of block size. |
||||
- Number of threads in the dimension Y of each block, i.e. the Y component of block size. |
||||
- Number of threads in the dimension Z of each block, i.e. the Z component of block size. |
||||
- Amount of dynamic shared memory that will be available to each workgroup, in bytes. Not used in this example. |
||||
- The device stream, on which the kernel should be dispatched. If 0 (or NULL), the NULL stream will be used. In this example the latter is used. |
||||
- Pointer to the arguments needed by the kernel. Note that this parameter is not yet implemented, and thus the _extra_ parameter (the last one described in this list) should be used to pass arguments to the kernel. |
||||
- Pointer to all extra arguments passed to the kernel. They must be in the memory layout and alignment expected by the kernel. The list of arguments must end with `HIP_LAUNCH_PARAM_END`. |
||||
- `hipModuleLoadData` builds a module from a code (compiled binary) object residing in host memory and loads it into the current context. Note that in this example this function is called right after `hipMalloc`. This is due to the fact that, on CUDA, `hipModuleLoadData` will fail if it is not called after some runtime API call has been made (as that call implicitly initializes a current context) or after a (current) context has been created explicitly. |
||||
- `hipModuleUnload` unloads the specified module from the current context and frees it. |
||||
- `hiprtcCompileProgram` compiles the given program at runtime. Some compilation options may be passed as parameters to this function. In this example, the GPU architecture is the only compilation option. |
||||
- `hiprtcCreateProgram` instantiates a runtime compilation program from the given parameters. Those are the following: |
||||
- The runtime compilation program object that will be set with the new instance. |
||||
- A pointer to the program source code. |
||||
- A pointer to the program name. |
||||
- The number of headers to be included. |
||||
- An array of pointers to the header sources, i.e. the contents of each header. |
||||
- An array of pointers to the names by which those headers are included in the source program. |
||||
|
||||
In this example the program is created including two header files to illustrate how to pass all of the above arguments to this function. |
||||
- `hiprtcDestroyProgram` destroys an instance of a given runtime compilation program object. |
||||
- `hiprtcGetProgramLog` extracts the char pointer to the log generated during the compilation of a given runtime compilation program. |
||||
- `hiprtcGetProgramLogSize` returns the compilation log size of a given runtime compilation program, measured as number of characters. |
||||
- `hiprtcGetCode` extracts the char pointer to the compilation binary in memory from a runtime compilation program object. This binary is needed to load the corresponding HIP module into the current context and extract from it the kernel(s) that will be executed on the GPU. |
||||
- `hiprtcGetCodeSize` returns the size of the binary compiled of a given runtime compilation program, measured as number of characters. |
||||
|
||||
## Demonstrated API Calls |
||||
|
||||
### HIP runtime |
||||
|
||||
#### Device symbols |
||||
- `threadIdx`, `blockIdx`, `blockDim` |
||||
|
||||
#### Host symbols |
||||
- `hipFree` |
||||
- `hipGetDeviceProperties` |
||||
- `hipGetLastError` |
||||
- `hipMalloc` |
||||
- `hipMemcpy` |
||||
- `hipMemcpyDeviceToHost` |
||||
- `hipMemcpyHostToDevice` |
||||
- `hipModuleGetFunction` |
||||
- `hipModuleLaunchKernel` |
||||
- `hipModuleLoadData` |
||||
- `hipModuleUnload` |
||||
- `hiprtcCompileProgram` |
||||
- `hiprtcCreateProgram` |
||||
- `hiprtcDestroyProgram` |
||||
- `hiprtcGetCode` |
||||
- `hiprtcGetCodeSize` |
||||
- `hiprtcGetProgramLog` |
||||
- `hiprtcGetProgramLogSize` |
||||
- `HIP_LAUNCH_PARAM_BUFFER_POINTER` |
||||
- `HIP_LAUNCH_PARAM_BUFFER_SIZE` |
||||
- `HIP_LAUNCH_PARAM_END` |
After Width: | Height: | Size: 22 KiB |
@ -0,0 +1,215 @@
@@ -0,0 +1,215 @@
|
||||
// MIT License |
||||
// |
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
// |
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
// of this software and associated documentation files (the "Software"), to deal |
||||
// in the Software without restriction, including without limitation the rights |
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
// copies of the Software, and to permit persons to whom the Software is |
||||
// furnished to do so, subject to the following conditions: |
||||
// |
||||
// The above copyright notice and this permission notice shall be included in all |
||||
// copies or substantial portions of the Software. |
||||
// |
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
// SOFTWARE. |
||||
|
||||
#include "example_utils.hpp" |
||||
|
||||
#include <hip/hip_runtime.h> |
||||
#include <hip/hiprtc.h> |
||||
|
||||
#include <algorithm> |
||||
#include <iostream> |
||||
#include <numeric> |
||||
#include <vector> |
||||
|
||||
// Source code of the SAXPY kernel, kept as a string so that it can be handed to hipRTC
// and compiled while the program is running (see hiprtcCreateProgram in main).
// - The types `real` and `realptr` used in the signature are not built-in: they are
//   typedefs provided by the two in-memory headers "test_header.h" and "test_header1.h",
//   whose contents are passed to hiprtcCreateProgram in main.
// - `extern "C"` gives the kernel C linkage so that its symbol name is not mangled; main
//   looks the kernel up in the loaded module by the plain name "saxpy_kernel".
// - Each thread updates at most one element: y[i] = a * x[i] + y[i], guarded by `size`.
||||
static constexpr auto saxpy_kernel{ |
||||
R"( |
||||
#include "test_header.h" |
||||
#include "test_header1.h" |
||||
extern "C" |
||||
__global__ void saxpy_kernel(const real a, const realptr d_x, realptr d_y, const unsigned int size) |
||||
{ |
||||
const unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x; |
||||
if(global_idx < size) |
||||
{ |
||||
d_y[global_idx] = a * d_x[global_idx] + d_y[global_idx]; |
||||
} |
||||
} |
||||
)"}; |
||||
|
||||
int main() |
||||
{ |
||||
// Program to be compiled in runtime. |
||||
hiprtcProgram prog; |
||||
|
||||
// Vector containing example header names. |
||||
std::vector<const char*> header_names; |
||||
header_names.push_back("test_header.h"); |
||||
header_names.push_back("test_header1.h"); |
||||
|
||||
// Vector containing example names to be included in the program. |
||||
std::vector<const char*> header_sources; |
||||
header_sources.push_back("#ifndef HIPRTC_TEST_HEADER_H\n#define HIPRTC_TEST_HEADER_H\ntypedef " |
||||
"float real;\n#endif //HIPRTC_TEST_HEADER_H\n"); |
||||
header_sources.push_back( |
||||
"#ifndef HIPRTC_TEST_HEADER1_H\n#define HIPRTC_TEST_HEADER1_H\ntypedef float* " |
||||
"realptr;\n#endif //HIPRTC_TEST_HEADER1_H\n"); |
||||
|
||||
// Create program. |
||||
hiprtcCreateProgram(&prog, |
||||
saxpy_kernel, |
||||
"saxpy_kernel.cu", |
||||
header_sources.size(), |
||||
header_sources.data(), |
||||
header_names.data()); |
||||
|
||||
// Get device properties from the first device available. |
||||
hipDeviceProp_t props; |
||||
constexpr unsigned int device_id = 0; |
||||
HIP_CHECK(hipGetDeviceProperties(&props, device_id)); |
||||
|
||||
// Obtain architecture's name from device properties and initialize array of compile options. When in CUDA we omit this option. |
||||
std::string sarg |
||||
= (props.gcnArchName[0]) ? std::string("--gpu-architecture=") + props.gcnArchName : ""; |
||||
const char* options[] = {sarg.c_str()}; |
||||
const int num_options = !sarg.empty(); |
||||
|
||||
// Compile program in runtime. Parameters are the program, number of options and array with options. |
||||
const hiprtcResult compile_result{hiprtcCompileProgram(prog, num_options, options)}; |
||||
|
||||
// Get the size of the log (possibly) generated during the compilation. |
||||
size_t log_size; |
||||
hiprtcGetProgramLogSize(prog, &log_size); |
||||
|
||||
// If the compilation generated a log, print it. |
||||
if(log_size) |
||||
{ |
||||
std::string log(log_size, '\0'); |
||||
hiprtcGetProgramLog(prog, &log[0]); |
||||
std::cout << log << std::endl; |
||||
} |
||||
|
||||
// If the compilation failed, say so and exit. |
||||
if(compile_result != HIPRTC_SUCCESS) |
||||
{ |
||||
std::cout << "Error: compilation failed." << std::endl; |
||||
return EXIT_FAILURE; |
||||
} |
||||
|
||||
// Get the size (in number of characters) of the binary compiled from the program. |
||||
size_t code_size; |
||||
hiprtcGetCodeSize(prog, &code_size); |
||||
|
||||
// Store compiled binary as a vector of characters. |
||||
std::vector<char> code(code_size); |
||||
hiprtcGetCode(prog, code.data()); |
||||
|
||||
// Destroy program object. |
||||
hiprtcDestroyProgram(&prog); |
||||
|
||||
// Now we launch the kernel on the device. |
||||
|
||||
// Total number of float elements in each device vector. |
||||
constexpr unsigned int size = 4096; |
||||
|
||||
// Total number of bytes to allocate for each device vector. |
||||
constexpr size_t size_bytes = size * sizeof(float); |
||||
|
||||
// Number of threads per kernel block. |
||||
constexpr unsigned int block_size = 128; |
||||
|
||||
// Number of blocks per kernel grid, calculated as ceil(size/block_size). |
||||
constexpr unsigned int grid_size = (size + block_size - 1) / block_size; |
||||
|
||||
// Constant value 'a' to be used in the expression 'a*x+y'. |
||||
constexpr float a = 5.1f; |
||||
|
||||
// Allocate x vector in host and fill it with increasing sequence 1, 2, 3, 4, ... . |
||||
std::vector<float> x(size); |
||||
std::iota(x.begin(), x.end(), 1.f); |
||||
|
||||
// Allocate y vector in host and fill it with increasing sequence 2, 4, 6, 8, ... . |
||||
std::vector<float> y(x); |
||||
std::for_each(y.begin(), y.end(), [](float& f) { f = 2 * f; }); |
||||
|
||||
// Allocate vectors in device and copy from host to device memory. |
||||
float* d_x{}; |
||||
float* d_y{}; |
||||
HIP_CHECK(hipMalloc(&d_x, size_bytes)); |
||||
HIP_CHECK(hipMalloc(&d_y, size_bytes)); |
||||
HIP_CHECK(hipMemcpy(d_x, x.data(), size_bytes, hipMemcpyHostToDevice)); |
||||
HIP_CHECK(hipMemcpy(d_y, y.data(), size_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Load the HIP module corresponding to the compiled binary into the current context. |
||||
hipModule_t module; |
||||
HIP_CHECK(hipModuleLoadData(&module, code.data())); |
||||
|
||||
// Extract SAXPY kernel from module into a function object. |
||||
hipFunction_t kernel; |
||||
HIP_CHECK(hipModuleGetFunction(&kernel, module, "saxpy_kernel")); |
||||
|
||||
// Create and fill array with kernel arguments. |
||||
size_t offset = 0; |
||||
char args[256] = {}; |
||||
|
||||
*(reinterpret_cast<float*>(&args[offset])) = a; |
||||
offset += sizeof(a); |
||||
offset += 4; // aligning fix for CUDA executions |
||||
*(reinterpret_cast<float**>(&args[offset])) = d_x; |
||||
offset += sizeof(d_x); |
||||
*(reinterpret_cast<float**>(&args[offset])) = d_y; |
||||
offset += sizeof(d_y); |
||||
*(reinterpret_cast<unsigned int*>(&args[offset])) = size; |
||||
offset += sizeof(size); |
||||
|
||||
// Create array with kernel arguments and its size. |
||||
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, |
||||
args, |
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, |
||||
&offset, |
||||
HIP_LAUNCH_PARAM_END}; |
||||
|
||||
std::cout << "Calculating y[i] = a * x[i] + y[i] over " << size << " elements." << std::endl; |
||||
|
||||
// Launch the kernel on the NULL stream and with the above configuration. |
||||
HIP_CHECK(hipModuleLaunchKernel(kernel, |
||||
grid_size, |
||||
1, |
||||
1, |
||||
block_size, |
||||
1, |
||||
1, |
||||
0, |
||||
nullptr, |
||||
nullptr, |
||||
(void**)&config)); |
||||
|
||||
// Check if the kernel launch was successful. |
||||
HIP_CHECK(hipGetLastError()) |
||||
|
||||
// Copy results from device to host. |
||||
HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost)); |
||||
|
||||
// Free device memory. |
||||
HIP_CHECK(hipFree(d_x)); |
||||
HIP_CHECK(hipFree(d_y)); |
||||
|
||||
// Unload module. |
||||
HIP_CHECK(hipModuleUnload(module)); |
||||
|
||||
// Print the first few elements of the results for validation. |
||||
constexpr size_t elements_to_print = 10; |
||||
std::cout << "First " << elements_to_print << " elements of the results: " |
||||
<< format_range(y.begin(), y.begin() + elements_to_print) << std::endl; |
||||
|
||||
return 0; |
||||
} |
@ -0,0 +1,101 @@
@@ -0,0 +1,101 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{E03790B7-B203-4504-BEF5-F4F061183642}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>runtime_compilation_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>hiprtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>hiprtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
@ -0,0 +1 @@
@@ -0,0 +1 @@
|
||||
hip_shared_memory |
@ -0,0 +1,59 @@
@@ -0,0 +1,59 @@
|
||||
# MIT License |
||||
# |
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
# |
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
# of this software and associated documentation files (the "Software"), to deal |
||||
# in the Software without restriction, including without limitation the rights |
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
# copies of the Software, and to permit persons to whom the Software is |
||||
# furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included in all |
||||
# copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
# SOFTWARE. |
||||
|
||||
set(example_name hip_shared_memory) |
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) |
||||
project(${example_name} LANGUAGES CXX) |
||||
|
||||
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") |
||||
set(GPU_RUNTIMES "HIP" "CUDA") |
||||
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) |
||||
|
||||
if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) |
||||
set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") |
||||
message(FATAL_ERROR ${ERROR_MESSAGE}) |
||||
endif() |
||||
|
||||
enable_language(${GPU_RUNTIME}) |
||||
set(CMAKE_${GPU_RUNTIME}_STANDARD 17) |
||||
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) |
||||
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) |
||||
|
||||
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") |
||||
if(NOT CMAKE_PREFIX_PATH) |
||||
set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") |
||||
endif() |
||||
|
||||
add_executable(${example_name} main.hip) |
||||
# Make example runnable using ctest |
||||
add_test(${example_name} ${example_name}) |
||||
|
||||
set(include_dirs "../../Common") |
||||
|
||||
# For examples targeting NVIDIA, include the HIP header directory. |
||||
if(GPU_RUNTIME STREQUAL "CUDA") |
||||
list(APPEND include_dirs "${ROCM_ROOT}/include") |
||||
endif() |
||||
|
||||
target_include_directories(${example_name} PRIVATE ${include_dirs}) |
||||
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) |
@ -0,0 +1,54 @@
@@ -0,0 +1,54 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
EXAMPLE := hip_shared_memory |
||||
COMMON_INCLUDE_DIR := ../../Common |
||||
GPU_RUNTIME := HIP |
||||
|
||||
# HIP variables
|
||||
ROCM_INSTALL_DIR := /opt/rocm |
||||
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include |
||||
|
||||
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc |
||||
|
||||
# Common variables and flags
|
||||
CXX_STD := c++17 |
||||
CXXFLAGS := -std=$(CXX_STD) |
||||
CPPFLAGS := -I $(COMMON_INCLUDE_DIR) |
||||
LDFLAGS := |
||||
LDLIBS := |
||||
|
||||
ifeq ($(GPU_RUNTIME), CUDA) |
||||
CXXFLAGS += -x cu |
||||
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) |
||||
else ifeq ($(GPU_RUNTIME), HIP) |
||||
else |
||||
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) |
||||
endif |
||||
|
||||
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp |
||||
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ |
||||
|
||||
clean: |
||||
$(RM) $(EXAMPLE) |
||||
|
||||
.PHONY: clean |
@ -0,0 +1,47 @@
@@ -0,0 +1,47 @@
|
||||
# HIP-Basic Shared Memory Example |
||||
|
||||
## Description |
||||
The shared memory is an on-chip type of memory that is visible to all the threads within the same block, allowing them to communicate by writing and reading data from the same memory space. However, some synchronization among the threads of the block is needed to ensure that all of them have written before trying to access the data. |
||||
|
||||
When using the appropriate access pattern, this memory can provide much lower latency than local or global memory (nearly as low as that of registers), making it a much better option in certain cases. If the size of the shared memory to be used is known at compile time, it can be explicitly specified and it is then known as static shared memory. |
||||
|
||||
This example implements a simple matrix transpose kernel to showcase how to use static shared memory. |
||||
|
||||
### Application flow |
||||
1. A number of constants are defined for the kernel launch parameters. |
||||
2. The input and output matrices are allocated and initialized in host memory. |
||||
3. The necessary amount of device memory for the input and output matrices is allocated and the input data is copied to the device. |
||||
4. A trace message is printed to the standard output. |
||||
5. The GPU kernel is then launched with the previously defined arguments. |
||||
6. The transposed matrix is copied back to host memory. |
||||
7. All device memory is freed. |
||||
8. The expected transposed matrix is calculated with a CPU version of the transpose kernel and the transposed matrix obtained from the kernel execution is then compared with it. The result of the comparison is printed to the standard output. |
||||
|
||||
## Key APIs and Concepts |
||||
- `__shared__` is a variable declaration specifier necessary to allocate shared memory from the device. |
||||
- `__syncthreads` synchronizes all the threads within the same block. This synchronization barrier is used to ensure that every thread in a block has finished writing to shared memory before other threads in the block try to access that data. |
||||
- `hipMalloc` allocates device memory in global memory, and with `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others. |
||||
- `hipLaunchKernelGGL` queues the execution of a kernel on a device (GPU). |
||||
- `hipGetLastError` gets the last error returned by any HIP runtime API call. |
||||
- `hipFree` deallocates device memory allocated with `hipMalloc`. |
||||
|
||||
## Demonstrated API Calls |
||||
|
||||
### HIP runtime |
||||
- `__global__` |
||||
- `__shared__` |
||||
|
||||
#### Device symbols |
||||
- `blockDim` |
||||
- `blockIdx` |
||||
- `threadIdx` |
||||
- `__syncthreads` |
||||
|
||||
#### Host symbols |
||||
- `hipFree` |
||||
- `hipGetLastError` |
||||
- `hipLaunchKernelGGL` |
||||
- `hipMalloc` |
||||
- `hipMemcpy` |
||||
- `hipMemcpyDeviceToHost` |
||||
- `hipMemcpyHostToDevice` |
@ -0,0 +1,160 @@
@@ -0,0 +1,160 @@
|
||||
// MIT License |
||||
// |
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
// |
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
// of this software and associated documentation files (the "Software"), to deal |
||||
// in the Software without restriction, including without limitation the rights |
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
// copies of the Software, and to permit persons to whom the Software is |
||||
// furnished to do so, subject to the following conditions: |
||||
// |
||||
// The above copyright notice and this permission notice shall be included in all |
||||
// copies or substantial portions of the Software. |
||||
// |
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
// SOFTWARE. |
||||
|
||||
#include "example_utils.hpp" |
||||
|
||||
#include <hip/hip_runtime.h> |
||||
|
||||
#include <algorithm> |
||||
#include <iostream> |
||||
#include <numeric> |
||||
#include <vector> |
||||
|
||||
/// \brief Transposes the matrix \p in and stores the result in \p out using static shared memory. |
||||
template<const unsigned int Width = 64> |
||||
__global__ void matrix_transpose_kernel(float* out, const float* in) |
||||
{ |
||||
// Allocate the necessary amount of shared memory to store the transpose of the matrix. |
||||
constexpr unsigned int size = Width * Width; |
||||
__shared__ float shared_matrix_memory[size]; |
||||
|
||||
// Compute the row and column indexes of the matrix element that each thread is going |
||||
// to process. |
||||
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; |
||||
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; |
||||
|
||||
// If not out of bounds, transpose element (x,y). |
||||
if(x < Width && y < Width) |
||||
{ |
||||
// Store transposed element in shared memory. |
||||
shared_matrix_memory[y * Width + x] = in[x * Width + y]; |
||||
} |
||||
|
||||
// Syncronize threads so all writes are done before accessing shared memory again. |
||||
__syncthreads(); |
||||
|
||||
// If not out of bounds, transpose element (x,y). |
||||
if(x < Width && y < Width) |
||||
{ |
||||
// Copy transposed element from shared memory to global memory. |
||||
out[y * Width + x] = shared_matrix_memory[y * Width + x]; |
||||
} |
||||
} |
||||
|
||||
/// \brief Host reference: returns the transpose of the row-major \p width x \p width
/// matrix stored in \p input.
std::vector<float> expected_matrix_transpose(const std::vector<float>& input,
                                             const unsigned int        width)
{
    const unsigned int element_count = width * width;
    std::vector<float> transposed(element_count);

    // Walk the input linearly; element (row, col) lands at (col, row) of the result.
    for(unsigned int src = 0; src < element_count; ++src)
    {
        const unsigned int row        = src / width;
        const unsigned int col        = src % width;
        transposed[col * width + row] = input[src];
    }
    return transposed;
}
||||
|
||||
int main() |
||||
{ |
||||
// Number of rows and columns, total number of elements and size in bytes of the matrix |
||||
// to be transposed. |
||||
constexpr unsigned int width = 64; |
||||
constexpr unsigned int size = width * width; |
||||
constexpr unsigned int size_bytes = size * sizeof(float); |
||||
|
||||
// Number of threads in each dimension of the kernel block. |
||||
constexpr unsigned int block_size = 4; |
||||
|
||||
// Number of blocks in each dimension of the grid. Calculated as ceil(width/block_size). |
||||
constexpr unsigned int grid_size = (width + block_size - 1) / block_size; |
||||
|
||||
// Block and grid sizes in 2D. |
||||
constexpr dim3 block_dim(block_size, block_size); |
||||
constexpr dim3 grid_dim(grid_size, grid_size); |
||||
|
||||
// Allocate host input matrix and initialize with increasing sequence 10, 20, 30, .... |
||||
std::vector<float> matrix(size); |
||||
std::iota(matrix.begin(), matrix.end(), 1.f); |
||||
std::for_each(matrix.begin(), matrix.end(), [](float& f) { f = 10.f * f; }); |
||||
|
||||
// Allocate matrix to store the results of the kernel execution. |
||||
std::vector<float> transposed_matrix(size); |
||||
|
||||
// Allocate input and output matrices on device. |
||||
float* d_matrix{}; |
||||
float* d_transposed_matrix{}; |
||||
HIP_CHECK(hipMalloc(&d_matrix, size_bytes)); |
||||
HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes)); |
||||
|
||||
// Copy input matrix data from host to device. |
||||
HIP_CHECK(hipMemcpy(d_matrix, matrix.data(), size_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Print trace message. |
||||
std::cout << "Computing matrix transpose." << std::endl; |
||||
|
||||
// Launch kernel on the default stream. Passing kernel arguments at the end of the |
||||
// hipLaunchKernelGGL function call. |
||||
hipLaunchKernelGGL(matrix_transpose_kernel<width>, |
||||
grid_dim, |
||||
block_dim, |
||||
0, |
||||
hipStreamDefault, |
||||
d_transposed_matrix, |
||||
d_matrix); |
||||
|
||||
// Check if the kernel launch was successful. |
||||
HIP_CHECK(hipGetLastError()); |
||||
|
||||
// Copy results from device to host. |
||||
HIP_CHECK(hipMemcpy(transposed_matrix.data(), |
||||
d_transposed_matrix, |
||||
size_bytes, |
||||
hipMemcpyDeviceToHost)); |
||||
|
||||
// Free device memory. |
||||
HIP_CHECK(hipFree(d_matrix)); |
||||
HIP_CHECK(hipFree(d_transposed_matrix)); |
||||
|
||||
// Calculate expected transposed matrix with the CPU version of the kernel. |
||||
std::vector<float> expected_transposed_matrix = expected_matrix_transpose(matrix, width); |
||||
|
||||
// Validate results comparing with expected transposed matrix. |
||||
unsigned int errors = 0; |
||||
constexpr float eps = 1.0E-6; |
||||
std::cout << "Validating transposed matrix." << std::endl; |
||||
for(unsigned int i = 0; i < size; i++) |
||||
{ |
||||
errors += (std::fabs(transposed_matrix[i] - expected_transposed_matrix[i]) > eps); |
||||
} |
||||
|
||||
if(errors) |
||||
{ |
||||
std::cout << "Validation failed with " << errors << " errors." << std::endl; |
||||
return error_exit_code; |
||||
} |
||||
else |
||||
{ |
||||
std::cout << "Validation passed." << std::endl; |
||||
} |
||||
} |
@ -0,0 +1,99 @@
@@ -0,0 +1,99 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>shared_memory_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
@ -0,0 +1 @@
@@ -0,0 +1 @@
|
||||
hip_warp_shuffle |
@ -0,0 +1,58 @@
@@ -0,0 +1,58 @@
|
||||
# MIT License |
||||
# |
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
# |
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
# of this software and associated documentation files (the "Software"), to deal |
||||
# in the Software without restriction, including without limitation the rights |
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
# copies of the Software, and to permit persons to whom the Software is |
||||
# furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included in all |
||||
# copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
# SOFTWARE. |
||||
|
||||
set(example_name hip_warp_shuffle) |
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) |
||||
project(${example_name} LANGUAGES CXX) |
||||
|
||||
set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA") |
||||
set(GPU_RUNTIMES "HIP" "CUDA") |
||||
set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) |
||||
|
||||
if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) |
||||
set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.") |
||||
message(FATAL_ERROR ${ERROR_MESSAGE}) |
||||
endif() |
||||
|
||||
enable_language(${GPU_RUNTIME}) |
||||
set(CMAKE_${GPU_RUNTIME}_STANDARD 17) |
||||
set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) |
||||
set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) |
||||
|
||||
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") |
||||
if(NOT CMAKE_PREFIX_PATH) |
||||
set(CMAKE_PREFIX_PATH "${ROCM_ROOT}") |
||||
endif() |
||||
|
||||
add_executable(${example_name} main.hip) |
||||
# Make example runnable using ctest. |
||||
add_test(${example_name} ${example_name}) |
||||
|
||||
set(include_dirs "../../Common") |
||||
# For examples targeting NVIDIA, include the HIP header directory. |
||||
if(GPU_RUNTIME STREQUAL "CUDA") |
||||
list(APPEND include_dirs "${ROCM_ROOT}/include") |
||||
endif() |
||||
|
||||
target_include_directories(${example_name} PRIVATE ${include_dirs}) |
||||
set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) |
@ -0,0 +1,54 @@
@@ -0,0 +1,54 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
EXAMPLE := hip_warp_shuffle |
||||
COMMON_INCLUDE_DIR := ../../Common |
||||
GPU_RUNTIME := HIP |
||||
|
||||
# HIP variables
|
||||
ROCM_INSTALL_DIR := /opt/rocm |
||||
HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include |
||||
|
||||
HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc |
||||
|
||||
# Common variables and flags
|
||||
CXX_STD := c++17 |
||||
CXXFLAGS := -std=$(CXX_STD) |
||||
CPPFLAGS := -I $(COMMON_INCLUDE_DIR) |
||||
LDFLAGS := |
||||
LDLIBS := |
||||
|
||||
ifeq ($(GPU_RUNTIME), CUDA) |
||||
CXXFLAGS += -x cu |
||||
CPPFLAGS += -isystem $(HIP_INCLUDE_DIR) |
||||
else ifeq ($(GPU_RUNTIME), HIP) |
||||
else |
||||
$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) |
||||
endif |
||||
|
||||
$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp |
||||
$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@ |
||||
|
||||
clean: |
||||
$(RM) $(EXAMPLE) |
||||
|
||||
.PHONY: clean |
@ -0,0 +1,53 @@
@@ -0,0 +1,53 @@
|
||||
# HIP-Basic Warp Shuffle Example |
||||
|
||||
## Description |
||||
Kernel code for a particular block is executed in groups of threads known as _wavefronts_ (AMD) or _warps_ (NVIDIA). Each block is divided into as many warps as the block's size allows. If the block size is less than the warp size, then part of the warp just stays idle (as happens in this example). AMD GPUs use 64 threads per wavefront for architectures prior to RDNA™ 1. RDNA architectures support both 32 and 64 wavefront sizes. |
||||
|
||||
Warps are executed in _lockstep_, i.e. all the threads in each warp execute the same instruction at the same time but with different data. This type of parallel processing is also known as Single Instruction, Multiple Data (SIMD). A block contains several warps and the warp size is dependent on the architecture, but the block size is not. Blocks and warps also differ in the way they are executed, and thus they may provide different results when used in the same piece of code. For instance, the kernel code of this example would not work as is with block-level execution and shared memory access, because some synchronization would be needed to ensure that every thread has written its corresponding value before another thread tries to read it. |
||||
|
||||
Higher performance in the execution of kernels can be achieved with explicit warp-level programming. This can be done by using some collective operations, known as _warp shuffles_, that allow exchanging data between threads in the same warp without the need for shared memory. This exchange occurs simultaneously for all the active threads in the warp. |
||||
|
||||
This example showcases how to use the above-mentioned operations by implementing a simple matrix transpose kernel. |
||||
|
||||
### Application flow |
||||
1. A number of constants are defined for the kernel launch parameters. |
||||
2. The input and output matrices are allocated and initialized in host memory. |
||||
3. The necessary amount of device memory for the input and output matrices is allocated and the input data is copied to the device. |
||||
4. A trace message is printed to the standard output. |
||||
5. The GPU kernel is then launched with the previously defined arguments. |
||||
6. The transposed matrix is copied back to host memory. |
||||
7. All device memory is freed. |
||||
8. The expected transposed matrix is calculated with a CPU version of the transpose kernel and the transposed matrix obtained from the kernel execution is then compared with it. The result of the comparison is printed to the standard output. |
||||
|
||||
## Key APIs and Concepts |
||||
Warp shuffle is a warp-level primitive that allows for the communication between the threads of a warp. Below is a simple example that shows how the value of the thread with index 2 is copied to all other threads within the warp. |
||||
 |
||||
|
||||
`__shfl(var, src_lane, width = warp_size)` copies the value of a `var` from the thread `src_lane` within the warp. This operation admits a third parameter (not used in this example), `width`, defaulted to the warp size value and which allows restricting the number of threads of the warp from which values are read. Values are copied from threads with an ID in the range $[0, width-1]$. If the ID of the thread specified in the call to `__shfl` is out of that range, then the thread accessed is the one with that ID modulo `width`. The `src_lane` may also vary per thread, as shown below. |
||||
|
||||
 |
||||
|
||||
- `hipGetDeviceProperties` gets the properties of the specified device. In this example, it is used to get the warp size of the device (GPU) used. |
||||
- `hipMalloc` allocates memory in the global memory of the device, and with `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others. |
||||
- `hipLaunchKernelGGL` queues the execution of a kernel on a device (GPU). |
||||
- `hipGetLastError` gets the last error returned by any HIP runtime API call. |
||||
- `hipFree` deallocates device memory allocated with `hipMalloc`. |
||||
|
||||
## Demonstrated API Calls |
||||
|
||||
### HIP runtime |
||||
|
||||
#### Device symbols |
||||
- `__global__` |
||||
- `threadIdx` |
||||
- `__shfl` |
||||
|
||||
#### Host symbols |
||||
- `hipFree` |
||||
- `hipGetDeviceProperties` |
||||
- `hipGetLastError` |
||||
- `hipLaunchKernelGGL` |
||||
- `hipMalloc` |
||||
- `hipMemcpy` |
||||
- `hipMemcpyDeviceToHost` |
||||
- `hipMemcpyHostToDevice` |
||||
- `hipStreamDefault` |
@ -0,0 +1,156 @@
@@ -0,0 +1,156 @@
|
||||
// MIT License |
||||
// |
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. |
||||
// |
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
// of this software and associated documentation files (the "Software"), to deal |
||||
// in the Software without restriction, including without limitation the rights |
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
// copies of the Software, and to permit persons to whom the Software is |
||||
// furnished to do so, subject to the following conditions: |
||||
// |
||||
// The above copyright notice and this permission notice shall be included in all |
||||
// copies or substantial portions of the Software. |
||||
// |
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
// SOFTWARE. |
||||
|
||||
#include "example_utils.hpp" |
||||
|
||||
#include <hip/hip_runtime.h> |
||||
|
||||
#include <algorithm> |
||||
#include <iostream> |
||||
#include <numeric> |
||||
#include <vector> |
||||
|
||||
/// \brief Transposes the matrix \p in and stores the result in \p out using warp shuffle operations. |
||||
__global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width) |
||||
{ |
||||
// Compute the row and column indexes of the matrix element that each thread is going |
||||
// to process. Since in this example there is only one block, the indexes are |
||||
// precisely the thread's ID in each dimension. |
||||
const unsigned int x = threadIdx.x; |
||||
const unsigned int y = threadIdx.y; |
||||
|
||||
// If not out of bounds, transpose element. |
||||
if(x < width && y < width) |
||||
{ |
||||
// Read element from global memory. Each thread in the warp is reading the element that |
||||
// the thread with global id x * width + y will transpose. |
||||
const float val = in[y * width + x]; |
||||
|
||||
// Transpose element reading it from the correspondent thread with a shuffle operation (__shfl). |
||||
// __shfl does not require all threads to be active, so it can be inside the if block. |
||||
// Note that, since the matrix in this example has less elements than the warp size value, |
||||
// the ID within the warp of each thread matches its global ID. |
||||
out[x * width + y] = __shfl(val, y * width + x); |
||||
} |
||||
} |
||||
|
||||
/// \brief Computes on the host the transpose of the row-major \p width x \p width matrix
/// \p input and returns it as a new vector.
std::vector<float> expected_matrix_transpose(const std::vector<float>& input,
                                             const unsigned int        width)
{
    std::vector<float> result(width * width);

    // Consume the input in storage order and scatter each element to its mirrored slot.
    unsigned int src = 0;
    for(unsigned int row = 0; row < width; ++row)
    {
        for(unsigned int col = 0; col < width; ++col)
        {
            result[col * width + row] = input[src];
            ++src;
        }
    }
    return result;
}
||||
|
||||
int main() |
||||
{ |
||||
// Number of rows and columns, total number of elements and size in bytes of the matrix |
||||
// to be transposed. |
||||
constexpr unsigned int width = 4; |
||||
constexpr unsigned int size = width * width; |
||||
constexpr unsigned int size_bytes = size * sizeof(float); |
||||
|
||||
// Get device's warp size. |
||||
hipDeviceProp_t props; |
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0 /*device ID*/)); |
||||
|
||||
// To guarantee the correct behaviour of the program, keep total number of matrix elements |
||||
// below (or equal to) warp size. |
||||
assert(size <= props.warpSize |
||||
&& "Matrix has more elements than architecture's warp size value."); |
||||
|
||||
// Block (2D) and grid sizes. Note that in this example we have only 1 block (and 1 warp). |
||||
constexpr dim3 block_dim(width, width); |
||||
constexpr dim3 grid_dim(1); |
||||
|
||||
// Allocate host input matrix and initialize with increasing sequence 10, 20, 30, .... |
||||
std::vector<float> matrix(size); |
||||
std::iota(matrix.begin(), matrix.end(), 1.f); |
||||
std::for_each(matrix.begin(), matrix.end(), [](float& f) { f = 10.f * f; }); |
||||
|
||||
// Allocate matrix to store the results of the kernel execution. |
||||
std::vector<float> transposed_matrix(size); |
||||
|
||||
// Allocate input and output matrices on device. |
||||
float* d_matrix{}; |
||||
float* d_transposed_matrix{}; |
||||
HIP_CHECK(hipMalloc(&d_matrix, size_bytes)); |
||||
HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes)); |
||||
|
||||
// Copy input matrix data from host to device. |
||||
HIP_CHECK(hipMemcpy(d_matrix, matrix.data(), size_bytes, hipMemcpyHostToDevice)); |
||||
|
||||
// Print trace message. |
||||
std::cout << "Computing matrix transpose." << std::endl; |
||||
|
||||
// Lauching kernel from host |
||||
hipLaunchKernelGGL(matrix_transpose_kernel, |
||||
grid_dim, |
||||
block_dim, |
||||
0, |
||||
hipStreamDefault, |
||||
d_transposed_matrix, |
||||
d_matrix, |
||||
width); |
||||
|
||||
// Check if the kernel launch was successful. |
||||
HIP_CHECK(hipGetLastError()); |
||||
|
||||
// Copy results from device to host. |
||||
HIP_CHECK(hipMemcpy(transposed_matrix.data(), |
||||
d_transposed_matrix, |
||||
size_bytes, |
||||
hipMemcpyDeviceToHost)); |
||||
|
||||
// Free device memory. |
||||
HIP_CHECK(hipFree(d_matrix)); |
||||
HIP_CHECK(hipFree(d_transposed_matrix)); |
||||
|
||||
// Calculate expected transposed matrix with the CPU version of the kernel. |
||||
std::vector<float> expected_transposed_matrix = expected_matrix_transpose(matrix, width); |
||||
|
||||
// Validate results comparing with expected transposed matrix. |
||||
unsigned int errors = 0; |
||||
constexpr float eps = 1.0E-6; |
||||
std::cout << "Validating transposed matrix." << std::endl; |
||||
for(unsigned int i = 0; i < size; i++) |
||||
{ |
||||
errors += (std::fabs(transposed_matrix[i] - expected_transposed_matrix[i]) > eps); |
||||
} |
||||
|
||||
if(errors) |
||||
{ |
||||
std::cout << "Validation failed with " << errors << " errors." << std::endl; |
||||
return error_exit_code; |
||||
} |
||||
else |
||||
{ |
||||
std::cout << "Validation passed." << std::endl; |
||||
} |
||||
} |
After Width: | Height: | Size: 22 KiB |
After Width: | Height: | Size: 22 KiB |
@ -0,0 +1,99 @@
@@ -0,0 +1,99 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.hip" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\Common\example_utils.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>warp_shuffle_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>hip_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
@ -1,174 +0,0 @@
@@ -1,174 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#ifndef SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP |
||||
#define SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP |
||||
|
||||
#include <algorithm>
#include <charconv>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
||||
|
||||
// Needed for the 's' suffix of `std::string` literals.
|
||||
using namespace std::string_literals; |
||||
|
||||
/// \brief Kinds of random distribution that can be requested through the
/// `--distribution` command line option.
///
/// The enumerator values are spelled out explicitly; they are the same values the
/// compiler would assign implicitly.
enum class Distribution
{
    uniform_int  = 0, ///< Selected by the argument value "uniform_int".
    uniform_real = 1, ///< Selected by the argument value "uniform_real".
    normal       = 2, ///< Selected by the argument value "normal".
    poisson      = 3 ///< Selected by the argument value "poisson".
};
||||
|
||||
/// \brief The set of arguments parsed from the command line.
|
||||
struct CliArguments |
||||
{ |
||||
int device_id_; |
||||
Distribution distribution_; |
||||
size_t size_; |
||||
bool print_; |
||||
}; |
||||
|
||||
/// \brief Operator overload to simply print a \p CliArguments instance.
|
||||
std::ostream& operator<<(std::ostream& os, const CliArguments& cli_args) |
||||
{ |
||||
// An immediately-invoked lambda expression selects the name of the distribution.
|
||||
const std::string_view distribution_name = [&]() |
||||
{ |
||||
switch(cli_args.distribution_) |
||||
{ |
||||
case Distribution::uniform_int: return "uniform_int"; |
||||
case Distribution::uniform_real: return "uniform_real"; |
||||
case Distribution::normal: return "normal"; |
||||
case Distribution::poisson: return "poisson"; |
||||
default: return "unknown"; |
||||
} |
||||
}(); |
||||
|
||||
// Printing the fields to the `std::ostream` object.
|
||||
return os << "Selected device id: " << cli_args.device_id_ |
||||
<< "\nSelected distribution: " << distribution_name |
||||
<< "\nSelected size: " << cli_args.size_ << "\nPrinting results: " << std::boolalpha |
||||
<< cli_args.print_; |
||||
} |
||||
|
||||
/// \brief Converts a \p std::string_view to integral type \p T.
///
/// The whole of \p arg_value must form the number. Input that merely starts with
/// digits (for example "12abc") is rejected instead of being truncated to its
/// numeric prefix.
///
/// \tparam T The integral type to convert to.
/// \param arg_value The textual argument value.
/// \return The converted value.
/// \throws std::runtime_error if \p arg_value is empty, is not a number, has trailing
/// characters after the number, or does not fit into \p T.
template<typename T>
T parse_integral_arg(const std::string_view arg_value)
{
    const char* const first = arg_value.data();
    const char* const last  = first + arg_value.size();

    // Value-initialized so that no indeterminate value is ever read.
    T value{};

    // `std::from_chars` reports success as soon as a numeric prefix was parsed. The returned
    // pointer tells where parsing stopped, so it must equal `last` for a full match.
    const auto [parsed_end, error] = std::from_chars(first, last, value);

    // The default constructed `std::errc` stands for successful conversion.
    if(error != std::errc{} || parsed_end != last)
    {
        throw std::runtime_error(std::string("Could not convert argument \"")
                                     .append(arg_value)
                                     .append("\" to an integral value"));
    }
    return value;
}
||||
|
||||
/// \brief Parses an \p std::string_view to a \p Distribution.
|
||||
/// Throws an exception with an error message if the conversion is unsuccessful.
|
||||
Distribution parse_distribution_arg(const std::string_view distribution_arg) |
||||
{ |
||||
if(distribution_arg == "uniform_int") |
||||
{ |
||||
return Distribution::uniform_int; |
||||
} |
||||
if(distribution_arg == "uniform_real") |
||||
{ |
||||
return Distribution::uniform_real; |
||||
} |
||||
if(distribution_arg == "normal") |
||||
{ |
||||
return Distribution::normal; |
||||
} |
||||
if(distribution_arg == "poisson") |
||||
{ |
||||
return Distribution::poisson; |
||||
} |
||||
throw std::runtime_error( |
||||
"Argument \""s.append(distribution_arg).append("\" is not a valid distribution")); |
||||
} |
||||
|
||||
/// \brief Parses the array of command line arguments to parameters consumed by the rest
|
||||
/// of the program. \p argc must be set to the size of the \p argv array. Each pointer in
|
||||
/// the \p argv array must point to a valid null-terminated string containing the argument.
|
||||
CliArguments parse_args(const int argc, const char** argv) |
||||
{ |
||||
// Pointers fulfill the random access iterator traits, thereby can be used with the
|
||||
// standard algorithms.
|
||||
const char** argv_end = argv + argc; |
||||
|
||||
// This local function searches for `arg_name` in the argument array and returns true if found.
|
||||
const auto find_argument = [&](const std::string_view arg_name) |
||||
{ |
||||
const auto arg_name_it = std::find(argv, argv_end, arg_name); |
||||
return arg_name_it != argv_end; |
||||
}; |
||||
|
||||
// This local function searches for `arg_name` in the argument array. If found, it returns a pointer
|
||||
// to the next argument -- that is assumed to be the provided value. Otherwise returns a null optional.
|
||||
// If the found argument is the last one, an exception with an error message is thrown.
|
||||
const auto find_argument_value |
||||
= [&](const std::string_view arg_name) -> std::optional<std::string_view> |
||||
{ |
||||
const auto arg_name_it = std::find(argv, argv_end, arg_name); |
||||
if(arg_name_it == argv_end) |
||||
{ |
||||
return std::nullopt; |
||||
} |
||||
// std::next returns the iterator copied and advanced by one
|
||||
const auto arg_value_it = std::next(arg_name_it); |
||||
if(arg_value_it == argv_end) |
||||
{ |
||||
throw std::runtime_error("Value for argument is not supplied: "s.append(arg_name)); |
||||
} |
||||
return std::make_optional(*arg_value_it); |
||||
}; |
||||
|
||||
// The options below need provided values, thereby `find_argument_value` is used.
|
||||
const auto device_arg = find_argument_value("--device").value_or("0"); |
||||
const auto distribution_arg = find_argument_value("--distribution").value_or("uniform_int"); |
||||
const auto size_arg = find_argument_value("--size").value_or("10000000"); |
||||
|
||||
// The option below is just a flag. Its existence is checked by `find_argument`.
|
||||
const bool print_arg = find_argument("--print"); |
||||
|
||||
// Parse the arguments read to the corresponding type and return.
|
||||
return {parse_integral_arg<int>(device_arg), |
||||
parse_distribution_arg(distribution_arg), |
||||
parse_integral_arg<size_t>(size_arg), |
||||
print_arg}; |
||||
} |
||||
|
||||
/// \brief Usage text listing the command line options accepted by `parse_args`.
///
/// Declared `inline constexpr` so that every translation unit including this header
/// refers to one and the same variable instead of getting its own internal copy.
inline constexpr std::string_view cli_usage_message
    = "Usage: simple_distributions_cpp [--device <device_id>] [--distribution "
      "{uniform_int|uniform_real|normal|poisson}] [--size <size>] [--print]";
||||
|
||||
#endif // SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP
|
@ -1,102 +1,104 @@
@@ -1,102 +1,104 @@
|
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.cpp" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\..\Common\example_utils.hpp" /> |
||||
<ClInclude Include="argument_parsing.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{13bb009a-0679-49c0-a763-3f0a388ea78f}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>simple_distributions_cpp_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>rocrand_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>rocrand_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
<?xml version="1.0" encoding="utf-8"?> |
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> |
||||
<ItemGroup Label="ProjectConfigurations"> |
||||
<ProjectConfiguration Include="Debug|x64"> |
||||
<Configuration>Debug</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
<ProjectConfiguration Include="Release|x64"> |
||||
<Configuration>Release</Configuration> |
||||
<Platform>x64</Platform> |
||||
</ProjectConfiguration> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClCompile Include="main.cpp" /> |
||||
</ItemGroup> |
||||
<ItemGroup> |
||||
<ClInclude Include="..\..\..\Common\example_utils.hpp" /> |
||||
<ClInclude Include="argument_parsing.hpp" /> |
||||
</ItemGroup> |
||||
<PropertyGroup Label="Globals"> |
||||
<VCProjectVersion>15.0</VCProjectVersion> |
||||
<ProjectGuid>{13bb009a-0679-49c0-a763-3f0a388ea78f}</ProjectGuid> |
||||
<Keyword>Win32Proj</Keyword> |
||||
<RootNamespace>simple_distributions_cpp_vs2019</RootNamespace> |
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>true</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> |
||||
<ConfigurationType>Application</ConfigurationType> |
||||
<UseDebugLibraries>false</UseDebugLibraries> |
||||
<PlatformToolset>HIP</PlatformToolset> |
||||
<WholeProgramOptimization>true</WholeProgramOptimization> |
||||
<CharacterSet>Unicode</CharacterSet> |
||||
</PropertyGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> |
||||
<ImportGroup Label="ExtensionSettings"> |
||||
<Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="Shared"> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> |
||||
</ImportGroup> |
||||
<PropertyGroup Label="UserMacros" /> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<LinkIncremental>true</LinkIncremental> |
||||
<TargetName>rocrand_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<LinkIncremental>false</LinkIncremental> |
||||
<TargetName>rocrand_$(ProjectName)</TargetName> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<TargetGPUArchitectures>gfx1030</TargetGPUArchitectures> |
||||
</PropertyGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level1</WarningLevel> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<RuntimeTypeInfo>true</RuntimeTypeInfo> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
||||
<ClCompile> |
||||
<WarningLevel>Level2</WarningLevel> |
||||
<FunctionLevelLinking>true</FunctionLevelLinking> |
||||
<IntrinsicFunctions>true</IntrinsicFunctions> |
||||
<PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> |
||||
<AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> |
||||
<LanguageStandard>stdcpp17</LanguageStandard> |
||||
<RuntimeTypeInfo>true</RuntimeTypeInfo> |
||||
</ClCompile> |
||||
<Link> |
||||
<SubSystem>Console</SubSystem> |
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding> |
||||
<OptimizeReferences>true</OptimizeReferences> |
||||
<GenerateDebugInformation>true</GenerateDebugInformation> |
||||
<AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> |
||||
</Link> |
||||
</ItemDefinitionGroup> |
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> |
||||
<ImportGroup Label="ExtensionTargets"> |
||||
<Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" /> |
||||
</ImportGroup> |
||||
</Project> |
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue