diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a4d2732e..6dd911d8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -48,9 +48,10 @@ clang-format:
   script:
     - cd $CI_PROJECT_DIR
     - git config --global --add safe.directory $CI_PROJECT_DIR
-    - scripts/code-format/check-format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT"
+    - Scripts/CodeFormat/check_format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT"
 
 .build:dockerfiles:
+  timeout: 20m
   image:
     name: gcr.io/kaniko-project/executor:debug
     entrypoint: [""]
@@ -209,7 +210,7 @@ test:rocm-windows-cmake:
       -S "$CI_PROJECT_DIR"
       -B "$CI_PROJECT_DIR/build"
       -G Ninja
-      -D CMAKE_BUILD_TYPE="$CONFIG"
+      -D CMAKE_BUILD_TYPE="$BUILD_TYPE"
       -D CMAKE_HIP_ARCHITECTURES=gfx1030
       -D CMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/10/bin/10.0.19041.0/x64/rc.exe"
     - cmake --build "$CI_PROJECT_DIR/build"
diff --git a/Common/cmdparser.hpp b/Common/cmdparser.hpp
new file mode 100644
index 00000000..a2a566b8
--- /dev/null
+++ b/Common/cmdparser.hpp
@@ -0,0 +1,768 @@
+// MIT License
+//
+// Copyright (c) 2015 - 2016 Florian Rappl
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+/*
+  This file is part of the C++ CmdParser utility.
+  Copyright (c) 2015 - 2019 Florian Rappl
+*/
+#pragma once
+#include <functional>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <typeinfo>
+#include <vector>
+
+namespace cli
+{
+/// Class used to wrap integer types to specify desired numerical base for specific argument parsing
+template<typename T, int numericalBase = 0>
+class NumericalBase
+{
+public:
+    /// Default constructor, required for correct ArgumentCountChecker initialization.
+    NumericalBase() : value(0), base(numericalBase) {}
+
+    /// Converting constructor, required for default value initialization from \p val.
+    NumericalBase(T val) : value(val), base(numericalBase) {}
+
+    /// Implicit conversion to the wrapped value.
+    operator T() const
+    {
+        return this->value;
+    }
+    /// Implicit conversion to a pointer to the wrapped value.
+    operator T*()
+    {
+        return &this->value;
+    }
+    T            value; ///< The wrapped value.
+    unsigned int base;  ///< Numerical base, taken from the numericalBase template argument.
+};
+
+struct CallbackArgs
+{
+    const std::vector<std::string>& arguments;
+    std::ostream&                   output;
+    std::ostream&                   error;
+};
+class Parser
+{
+private:
+    class CmdBase
+    {
+    public:
+        explicit CmdBase(const std::string& name,
+                         const std::string& alternative,
+                         const std::string& description,
+                         bool               required,
+                         bool               dominant,
+                         bool               variadic)
+            : name(name)
+            , command(name.size() > 0 ? "-" + name : "")
+            , alternative(alternative.size() > 0 ? "--" + alternative : "")
+            , description(description)
+            , required(required)
+            , handled(false)
+            , arguments({})
+            , dominant(dominant)
+            , variadic(variadic)
+        {}
+
+        virtual ~CmdBase() {}
+
+        std::string              name;
+        std::string              command;
+        std::string              alternative;
+        std::string              description;
+        bool                     required;
+        bool                     handled;
+        std::vector<std::string> arguments;
+        bool const               dominant;
+        bool const               variadic;
+
+        virtual std::string print_value() const                              = 0;
+        virtual bool        parse(std::ostream& output, std::ostream& error) = 0;
+
+        bool is(const std::string& given) const
+        {
+            return given == command || given == alternative;
+        }
+    };
+
+    template<typename T>
+    struct ArgumentCountChecker
+    {
+        static constexpr bool Variadic = false;
+    };
+
+    template<typename T>
+    struct ArgumentCountChecker<cli::NumericalBase<T>>
+    {
+        static constexpr bool Variadic = false;
+    };
+
+    template<typename T>
+    struct ArgumentCountChecker<std::vector<T>>
+    {
+        static constexpr bool Variadic = true;
+    };
+
+    template<typename T>
+    class CmdFunction final : public CmdBase // command whose value comes from a user callback
+    {
+    public:
+        explicit CmdFunction(const std::string& name,
+                             const std::string& alternative,
+                             const std::string& description,
+                             bool               required,
+                             bool               dominant)
+            : CmdBase(name,
+                      alternative,
+                      description,
+                      required,
+                      dominant,
+                      ArgumentCountChecker<T>::Variadic)
+        {}
+        /// Invokes the callback with the collected arguments; any exception yields false.
+        bool parse(std::ostream& output, std::ostream& error) override
+        {
+            try
+            {
+                CallbackArgs args{arguments, output, error};
+                value = callback(args);
+                return true;
+            }
+            catch(...)
+            {
+                return false;
+            }
+        }
+        /// Callback commands have no printable value, so this is always the empty string.
+        std::string print_value() const override
+        {
+            return "";
+        }
+
+        std::function<T(CallbackArgs&)> callback; ///< Invoked by parse().
+        T                               value{};  ///< Last callback result, value-initialized.
+    };
+
+    template<typename T>
+    class CmdArgument final : public CmdBase // command whose value is parsed from its arguments
+    {
+    public:
+        explicit CmdArgument(const std::string& name,
+                             const std::string& alternative,
+                             const std::string& description,
+                             bool               required,
+                             bool               dominant)
+            : CmdBase(name,
+                      alternative,
+                      description,
+                      required,
+                      dominant,
+                      ArgumentCountChecker<T>::Variadic)
+        {}
+        /// Converts the collected arguments with Parser::parse; any exception yields false.
+        bool parse(std::ostream&, std::ostream&) override
+        {
+            try
+            {
+                value = Parser::parse(arguments, value);
+                return true;
+            }
+            catch(...)
+            {
+                return false;
+            }
+        }
+        /// Renders the current value (the default until parse() succeeds) for the help text.
+        std::string print_value() const override
+        {
+            return stringify(value);
+        }
+
+        T value{}; // value-initialized: the bool overload of parse() and print_value() read it
+    };
+
+    /// Parses the single element of \p elements as int; \p numberBase is the std::stoi base.
+    static int parse(const std::vector<std::string>& elements, const int&, int numberBase = 0)
+    {
+        if(elements.size() == 1)
+            return std::stoi(elements.front(), nullptr, numberBase);
+        throw std::bad_cast(); // zero or several elements cannot form one int
+    }
+
+    static bool parse(const std::vector<std::string>& elements, const bool& defval)
+    {
+        if(elements.size() != 0)
+            throw std::runtime_error("A boolean command line parameter cannot have any arguments.");
+
+        return !defval;
+    }
+
+    static double parse(const std::vector<std::string>& elements, const double&)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stod(elements[0]);
+    }
+
+    static float parse(const std::vector<std::string>& elements, const float&)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stof(elements[0]);
+    }
+
+    static long double parse(const std::vector<std::string>& elements, const long double&)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stold(elements[0]);
+    }
+
+    static unsigned int
+        parse(const std::vector<std::string>& elements, const unsigned int&, int numberBase = 0)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return static_cast<unsigned int>(std::stoul(elements[0], 0, numberBase));
+    }
+
+    static unsigned long
+        parse(const std::vector<std::string>& elements, const unsigned long&, int numberBase = 0)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stoul(elements[0], 0, numberBase);
+    }
+
+    static unsigned long long parse(const std::vector<std::string>& elements,
+                                    const unsigned long long&,
+                                    int numberBase = 0)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stoull(elements[0], 0, numberBase);
+    }
+
+    static long long
+        parse(const std::vector<std::string>& elements, const long long&, int numberBase = 0)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stoll(elements[0], 0, numberBase);
+    }
+
+    static long parse(const std::vector<std::string>& elements, const long&, int numberBase = 0)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return std::stol(elements[0], 0, numberBase);
+    }
+
+    static std::string parse(const std::vector<std::string>& elements, const std::string&)
+    {
+        if(elements.size() != 1)
+            throw std::bad_cast();
+
+        return elements[0];
+    }
+
+    /// Parses each element separately with the overload selected for \p T and collects them.
+    template<class T>
+    static std::vector<T> parse(const std::vector<std::string>& elements, const std::vector<T>&)
+    {
+        const T                  defval = T();
+        std::vector<T>           values{};
+        std::vector<std::string> buffer(1); // element-wise overloads are fed one string each
+        values.reserve(elements.size());    // one allocation instead of repeated regrowth
+        for(const auto& element : elements)
+        {
+            buffer[0] = element;
+            values.push_back(parse(buffer, defval));
+        }
+        return values;
+    }
+
+    template<typename T>
+    static T parse(const std::vector<std::string>& elements, const NumericalBase<T>& wrapper)
+    {
+        return parse(elements, wrapper.value, 0);
+    }
+
+    /// Specialization for number wrapped into numerical base
+    /// \tparam T base type of the argument
+    /// \tparam base numerical base
+    /// \param elements
+    /// \param wrapper
+    /// \return parsed number
+    template<typename T, int base>
+    static T parse(const std::vector<std::string>& elements, const NumericalBase<T, base>& wrapper)
+    {
+        return parse(elements, wrapper.value, wrapper.base);
+    }
+
+    template<class T>
+    static std::string stringify(const T& value)
+    {
+        return std::to_string(value);
+    }
+
+    template<class T, int base>
+    static std::string stringify(const NumericalBase<T, base>& wrapper)
+    {
+        return std::to_string(wrapper.value);
+    }
+
+    template<class T>
+    static std::string stringify(const std::vector<T>& values)
+    {
+        std::stringstream ss{};
+        ss << "[ ";
+
+        for(const auto& value : values)
+        {
+            ss << stringify(value) << " ";
+        }
+
+        ss << "]";
+        return ss.str();
+    }
+
+    static std::string stringify(const std::string& str)
+    {
+        return str;
+    }
+
+public:
+    explicit Parser(int argc, const char** argv) : _appname(argv[0])
+    {
+        for(int i = 1; i < argc; ++i)
+        {
+            _arguments.push_back(argv[i]);
+        }
+        enable_help();
+    }
+
+    explicit Parser(int argc, char** argv) : _appname(argv[0])
+    {
+        for(int i = 1; i < argc; ++i)
+        {
+            _arguments.push_back(argv[i]);
+        }
+        enable_help();
+    }
+
+    Parser(int argc, const char** argv, std::string generalProgramDescriptionForHelpText)
+        : _appname(argv[0]), _general_help_text(std::move(generalProgramDescriptionForHelpText))
+    {
+        for(int i = 1; i < argc; ++i)
+        {
+            _arguments.push_back(argv[i]);
+        }
+        enable_help();
+    }
+
+    Parser(int argc, char** argv, std::string generalProgramDescriptionForHelpText)
+        : _appname(argv[0]), _general_help_text(std::move(generalProgramDescriptionForHelpText))
+    {
+        for(int i = 1; i < argc; ++i)
+        {
+            _arguments.push_back(argv[i]);
+        }
+        enable_help();
+    }
+
+    ~Parser()
+    {
+        for(size_t i = 0, n = _commands.size(); i < n; ++i)
+        {
+            delete _commands[i];
+        }
+    }
+
+    /// Reports whether a command named "h" with the alternative "--help" is registered.
+    bool has_help() const
+    {
+        bool found = false;
+        for(auto it = _commands.begin(); !found && it != _commands.end(); ++it)
+        {
+            const CmdBase* candidate = *it;
+            // The loop condition stops the scan as soon as a match has been seen.
+            found = candidate->name == "h" && candidate->alternative == "--help";
+        }
+        return found;
+    }
+
+    void enable_help()
+    {
+        set_callback("h",
+                     "help",
+                     std::function<bool(CallbackArgs&)>(
+                         [this](CallbackArgs& args)
+                         {
+                             args.output << this->usage();
+#pragma warning(push)
+#pragma warning(disable : 4702)
+                             exit(0);
+                             return false;
+#pragma warning(pop)
+                         }),
+                     "",
+                     true);
+    }
+
+    /// Removes and frees the first "-h"/"--help" command, if one is registered.
+    void disable_help()
+    {
+        for(auto command = _commands.begin(); command != _commands.end(); ++command)
+            if((*command)->name == "h" && (*command)->alternative == "--help")
+            {
+                delete *command; // owned by the parser; erase() alone would leak it
+                _commands.erase(command);
+                break;
+            }
+    }
+
+    template<typename T>
+    void set_default(bool is_required, const std::string& description = "")
+    {
+        auto command = new CmdArgument<T>{"", "", description, is_required, false};
+        _commands.push_back(command);
+    }
+
+    template<typename T>
+    void set_required(const std::string& name,
+                      const std::string& alternative,
+                      const std::string& description = "",
+                      bool               dominant    = false)
+    {
+        auto command = new CmdArgument<T>{name, alternative, description, true, dominant};
+        _commands.push_back(command);
+    }
+
+    template<typename T>
+    void set_optional(const std::string& name,
+                      const std::string& alternative,
+                      T                  defaultValue,
+                      const std::string& description = "",
+                      bool               dominant    = false)
+    {
+        auto command   = new CmdArgument<T>{name, alternative, description, false, dominant};
+        command->value = defaultValue;
+        _commands.push_back(command);
+    }
+
+    template<typename T>
+    void set_callback(const std::string&              name,
+                      const std::string&              alternative,
+                      std::function<T(CallbackArgs&)> callback,
+                      const std::string&              description = "",
+                      bool                            dominant    = false)
+    {
+        auto command      = new CmdFunction<T>{name, alternative, description, false, dominant};
+        command->callback = callback;
+        _commands.push_back(command);
+    }
+
+    /// Runs the parser and terminates the process with exit code 1 when run() reports failure.
+    inline void run_and_exit_if_error()
+    {
+        const bool parsed = run();
+        if(!parsed)
+            exit(1);
+    }
+
+    inline bool run()
+    {
+        return run(std::cout, std::cerr);
+    }
+
+    inline bool run(std::ostream& output)
+    {
+        return run(output, std::cerr);
+    }
+
+    /// Checks whether the raw command line contains "-<name>" or \p altName (compared verbatim).
+    bool doesArgumentExist(std::string name, std::string altName)
+    {
+        const std::string shortFlag = '-' + name; // hoisted: invariant across the loop
+        for(const auto& argument : _arguments)
+        {
+            if(argument == shortFlag || argument == altName)
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    inline bool doesHelpExist()
+    {
+        return doesArgumentExist("h", "--help");
+    }
+
+    bool run(std::ostream& output, std::ostream& error)
+    {
+        if(_arguments.size() > 0)
+        {
+            auto current = find_default();
+
+            for(size_t i = 0, n = _arguments.size(); i < n; ++i)
+            {
+                auto isarg      = _arguments[i].size() > 0 && _arguments[i][0] == '-';
+                auto associated = isarg ? find(_arguments[i]) : nullptr;
+
+                if(associated != nullptr)
+                {
+                    current             = associated;
+                    associated->handled = true;
+                }
+                else if(current == nullptr)
+                {
+                    error << no_default();
+                    return false;
+                }
+                else
+                {
+                    current->arguments.push_back(_arguments[i]);
+                    current->handled = true;
+                    if(!current->variadic)
+                    {
+                        // If the current command is not variadic, then no more arguments
+                        // should be added to it. In this case, switch back to the default
+                        // command.
+                        current = find_default();
+                    }
+                }
+            }
+        }
+
+        // First, parse dominant arguments since they succeed even if required
+        // arguments are missing.
+        for(auto command : _commands)
+        {
+            if(command->handled && command->dominant && !command->parse(output, error))
+            {
+                error << howto_use(command);
+                return false;
+            }
+        }
+
+        // Next, check for any missing arguments.
+        for(auto command : _commands)
+        {
+            if(command->required && !command->handled)
+            {
+                error << howto_required(command);
+                return false;
+            }
+        }
+
+        // Finally, parse all remaining arguments.
+        for(auto command : _commands)
+        {
+            if(command->handled && !command->dominant && !command->parse(output, error))
+            {
+                error << howto_use(command);
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    template<typename T>
+    T get(const std::string& name) const
+    {
+        for(const auto& command : _commands)
+        {
+            if(command->name == name)
+            {
+                auto cmd = dynamic_cast<CmdArgument<T>*>(command);
+
+                if(cmd == nullptr)
+                {
+                    throw std::runtime_error("Invalid usage of the parameter " + name
+                                             + " detected.");
+                }
+
+                return cmd->value;
+            }
+        }
+
+        throw std::runtime_error("The parameter " + name + " could not be found.");
+    }
+
+    template<typename T>
+    T get_if(const std::string& name, std::function<T(T)> callback) const
+    {
+        auto value = get<T>(name);
+        return callback(value);
+    }
+
+    /// Counts the registered commands that are flagged as required.
+    /// \return number of required commands
+    int requirements() const
+    {
+        int required_total = 0;
+
+        for(size_t i = 0, n = _commands.size(); i < n; ++i)
+        {
+            // `required` is a bool, so the conversion contributes exactly 0 or 1.
+            required_total += static_cast<int>(_commands[i]->required);
+        }
+
+        return required_total;
+    }
+
+    int commands() const
+    {
+        return static_cast<int>(_commands.size());
+    }
+
+    inline const std::string& app_name() const
+    {
+        return _appname;
+    }
+
+protected:
+    CmdBase* find(const std::string& name)
+    {
+        for(auto command : _commands)
+        {
+            if(command->is(name))
+            {
+                return command;
+            }
+        }
+
+        return nullptr;
+    }
+
+    CmdBase* find_default()
+    {
+        for(auto command : _commands)
+        {
+            if(command->name == "")
+            {
+                return command;
+            }
+        }
+
+        return nullptr;
+    }
+
+    std::string usage() const
+    {
+        std::stringstream ss{};
+        ss << _general_help_text << "\n\n";
+        ss << "Available parameters:\n\n";
+
+        for(const auto& command : _commands)
+        {
+            ss << "  " << command->command << "\t" << command->alternative;
+
+            if(command->required == true)
+            {
+                ss << "\t(required)";
+            }
+
+            ss << "\n   " << command->description;
+
+            if(command->required == false)
+            {
+                ss << "\n   "
+                   << "This parameter is optional. The default value is '" + command->print_value()
+                   << "'.";
+            }
+
+            ss << "\n\n";
+        }
+
+        return ss.str();
+    }
+
+    void print_help(std::stringstream& ss) const
+    {
+        if(has_help())
+        {
+            ss << "For more help use --help or -h.\n";
+        }
+    }
+
+    std::string howto_required(CmdBase* command) const
+    {
+        std::stringstream ss{};
+        ss << "The parameter " << command->name << " is required.\n";
+        ss << command->description << '\n';
+        print_help(ss);
+        return ss.str();
+    }
+
+    std::string howto_use(CmdBase* command) const
+    {
+        std::stringstream ss{};
+        ss << "The parameter " << command->name << " has invalid arguments.\n";
+        ss << command->description << '\n';
+        print_help(ss);
+        return ss.str();
+    }
+
+    std::string no_default() const
+    {
+        std::stringstream ss{};
+        ss << "No default parameter has been specified.\n";
+        ss << "The given argument must be used with a parameter.\n";
+        print_help(ss);
+        return ss.str();
+    }
+
+    const std::string& get_general_help_text() const
+    {
+        return _general_help_text;
+    }
+
+    void set_general_help_text(const std::string& generalHelpText)
+    {
+        _general_help_text = generalHelpText;
+    }
+
+private:
+    const std::string        _appname;
+    std::string              _general_help_text;
+    std::vector<std::string> _arguments;
+    std::vector<CmdBase*>    _commands;
+};
+} // namespace cli
\ No newline at end of file
diff --git a/Common/example_utils.hpp b/Common/example_utils.hpp
index c0c3f24a..9e555e50 100644
--- a/Common/example_utils.hpp
+++ b/Common/example_utils.hpp
@@ -99,4 +99,61 @@ std::string format_pairs(const BidirectionalIteratorT begin_a,
     return sstream.str();
 }
 
+/// \brief Parses \p str as a decimal integer. Returns true and stores the result in \p out only
+/// if the whole string is consumed; otherwise returns false and leaves \p out untouched.
+inline bool parse_int_string(const std::string& str, int& out)
+{
+    try
+    {
+        size_t end; // set by std::stoi to the index of the first unconsumed character
+        int    value = std::stoi(str, &end);
+        if(end == str.size())
+        {
+            out = value;
+            return true;
+        }
+        return false; // trailing characters remain, e.g. "42abc"
+    }
+    catch(const std::exception&) // std::stoi throws on non-numeric or out-of-range input
+    {
+        return false;
+    }
+}
+
+/// \brief Accumulating stopwatch based on std::chrono::steady_clock.
+class HostClock
+{
+private:
+    std::chrono::steady_clock::time_point start_time;
+    std::chrono::steady_clock::duration   elapsed_time;
+
+public:
+    HostClock()
+    {
+        reset_timer();
+    }
+
+    /// \brief Discards all time accumulated so far.
+    void reset_timer()
+    {
+        elapsed_time = std::chrono::steady_clock::duration::zero();
+    }
+    /// \brief Marks the beginning of a measured interval.
+    void start_timer()
+    {
+        start_time = std::chrono::steady_clock::now();
+    }
+    /// \brief Adds the time since the last start_timer() call to the accumulated total.
+    void stop_timer()
+    {
+        elapsed_time += std::chrono::steady_clock::now() - start_time;
+    }
+
+    /// \brief Returns the accumulated time in seconds.
+    double get_elapsed_time() const
+    {
+        using seconds_f64 = std::chrono::duration<double>;
+        return std::chrono::duration_cast<seconds_f64>(elapsed_time).count();
+    }
+};
 #endif // COMMON_EXAMPLE_UTILS_HPP
diff --git a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
index c19f64ed..291824d9 100644
--- a/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
+++ b/Dockerfiles/hip-libraries-cuda-ubuntu.Dockerfile
@@ -13,12 +13,17 @@ RUN export DEBIAN_FRONTEND=noninteractive; \
         ssh \
         sudo \
         wget \
+        pkg-config \
+        glslang-tools \
+        libvulkan-dev \
+        vulkan-validationlayers \
+        libglfw3-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Install HIP using the installer script
 RUN export DEBIAN_FRONTEND=noninteractive; \
     wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.2/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
+    && echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/5.3/ ubuntu main' > /etc/apt/sources.list.d/rocm.list \
     && apt-get update -qq \
     && apt-get install -y hip-base hipify-clang \
     && apt-get download hip-runtime-nvidia hip-dev \
@@ -36,25 +41,25 @@ RUN echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf \
     && ldconfig
 
 # Install rocRAND
-RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.2.0.tar.gz \
-    && tar -xf ./rocm-5.2.0.tar.gz \
-    && rm ./rocm-5.2.0.tar.gz \
-    && cmake -S ./rocRAND-rocm-5.2.0 -B ./rocRAND-rocm-5.2.0/build \
-        -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \
+RUN wget https://github.com/ROCmSoftwarePlatform/rocRAND/archive/refs/tags/rocm-5.3.0.tar.gz \
+    && tar -xf ./rocm-5.3.0.tar.gz \
+    && rm ./rocm-5.3.0.tar.gz \
+    && cmake -S ./rocRAND-rocm-5.3.0 -B ./rocRAND-rocm-5.3.0/build \
+        -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D BUILD_HIPRAND=OFF \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
-    && cmake --build ./rocRAND-rocm-5.2.0/build --target install \
-    && rm -rf ./rocRAND-rocm-5.2.0
+    && cmake --build ./rocRAND-rocm-5.3.0/build --target install \
+    && rm -rf ./rocRAND-rocm-5.3.0
 
 # Install hipCUB
-RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.2.0.tar.gz \
-    && tar -xf ./rocm-5.2.0.tar.gz \
-    && rm ./rocm-5.2.0.tar.gz \
-    && cmake -S ./hipCUB-rocm-5.2.0 -B ./hipCUB-rocm-5.2.0/build \
-        -D CMAKE_MODULE_PATH=/opt/rocm/lib/cmake/hip \
+RUN wget https://github.com/ROCmSoftwarePlatform/hipCUB/archive/refs/tags/rocm-5.3.0.tar.gz \
+    && tar -xf ./rocm-5.3.0.tar.gz \
+    && rm ./rocm-5.3.0.tar.gz \
+    && cmake -S ./hipCUB-rocm-5.3.0 -B ./hipCUB-rocm-5.3.0/build \
+        -D CMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -D CMAKE_INSTALL_PREFIX=/opt/rocm \
-    && cmake --build ./hipCUB-rocm-5.2.0/build --target install \
-    && rm -rf ./hipCUB-rocm-5.2.0
+    && cmake --build ./hipCUB-rocm-5.3.0/build --target install \
+    && rm -rf ./hipCUB-rocm-5.3.0
 
 # Add the render group and a user with sudo permissions for the container
 RUN groupadd --system --gid 109 render \
diff --git a/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile b/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile
index ce57678e..9b247978 100644
--- a/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile
+++ b/Dockerfiles/hip-libraries-rocm-ubuntu.Dockerfile
@@ -13,16 +13,21 @@ RUN export DEBIAN_FRONTEND=noninteractive; \
         ssh \
         sudo \
         wget \
+        pkg-config \
+        glslang-tools \
+        libvulkan-dev \
+        vulkan-validationlayers \
+        libglfw3-dev \
     && rm -rf /var/lib/apt/lists/*
 
 ENV LANG en_US.utf8
 
 # Install ROCM HIP and libraries using the installer script
 RUN export DEBIAN_FRONTEND=noninteractive; \
-    wget https://repo.radeon.com/amdgpu-install/22.20/ubuntu/focal/amdgpu-install_22.20.50200-1_all.deb \
+    wget https://repo.radeon.com/amdgpu-install/5.3/ubuntu/focal/amdgpu-install_5.3.50300-1_all.deb \
     && apt-get update -qq \
-    && apt-get install -y ./amdgpu-install_22.20.50200-1_all.deb \
-    && rm ./amdgpu-install_22.20.50200-1_all.deb \
+    && apt-get install -y ./amdgpu-install_5.3.50300-1_all.deb \
+    && rm ./amdgpu-install_5.3.50300-1_all.deb \
     && amdgpu-install -y --usecase=hiplibsdk --no-dkms \
     && apt-get install -y libnuma-dev \
     && rm -rf /var/lib/apt/lists/*
diff --git a/docs/CONTRIBUTING.md b/Docs/CONTRIBUTING.md
similarity index 74%
rename from docs/CONTRIBUTING.md
rename to Docs/CONTRIBUTING.md
index fcd45efa..70a8dcb0 100644
--- a/docs/CONTRIBUTING.md
+++ b/Docs/CONTRIBUTING.md
@@ -13,12 +13,16 @@ Every example has to be able to be built separately from the others, but also ha
 ## Code Format
 The formatting rules of the examples are enforced by `clang-format` using the `.clang-format` file in the top-level directory.
 
-## Naming Conventions
+## Variable Naming Conventions
 - Use `lower_snake_case` style to name variables and functions (e.g. block_size, multiply_kernel and multiply_host).
 - Use `PascalCase` for `class`, `struct`, `enum` and template argument definitions.
 
-## Binary Naming Conventions
-Use the prefix of the library for the name of the binary, so that there are no conflicts between libraries (e.g. hipcub_device_sum and rocprim_device_sum).
+## File and Directory Naming Conventions
+- Top-level directories use `PascalCase`.
+- The directories in Libraries/ should use the exact name of the library they represent, including casing. If any directory does not represent a library (`exampleLibraryTemplate`), it should be named in `camelCase`.
+- Directories for individual examples use `snake_case`.
+- Files generally use `snake_case`, with the exception of files for which an existing convention already applies (`README.md`, `LICENSE.md`, `CMakeLists.txt`, etc).
+- Example binaries should be prefixed with the name of the library they belong to, so that there are no conflicts between libraries (e.g. `hipcub_device_sum` and `rocprim_device_sum`).
 
 ## Utilities
 Utility-functions (printing vectors, etc) and common error-handling code, that is used by all examples, should be moved to the common utility-header [example_utils.hpp](../Common/example_utils.hpp).
diff --git a/HIP-Basic/CMakeLists.txt b/HIP-Basic/CMakeLists.txt
index 959a1d22..34abda25 100644
--- a/HIP-Basic/CMakeLists.txt
+++ b/HIP-Basic/CMakeLists.txt
@@ -23,6 +23,13 @@
 cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
 project(HIP-Basic)
 
+# Only supported on HIP (not CUDA)
+if(NOT "${GPU_RUNTIME}" STREQUAL "CUDA")
+    add_subdirectory(assembly_to_executable)
+    add_subdirectory(llvm_ir_to_executable)
+endif()
+
+add_subdirectory(bandwidth)
 add_subdirectory(device_query)
 add_subdirectory(dynamic_shared)
 add_subdirectory(events)
@@ -32,5 +39,8 @@ if(NOT WIN32)
 endif()
 add_subdirectory(matrix_multiplication)
 add_subdirectory(occupancy)
+add_subdirectory(runtime_compilation)
 add_subdirectory(saxpy)
+add_subdirectory(shared_memory)
 add_subdirectory(streams)
+add_subdirectory(warp_shuffle)
diff --git a/HIP-Basic/Makefile b/HIP-Basic/Makefile
index 0a07e38c..b04c0749 100644
--- a/HIP-Basic/Makefile
+++ b/HIP-Basic/Makefile
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 EXAMPLES := \
+	bandwidth \
 	device_query \
 	dynamic_shared \
 	events \
@@ -28,8 +29,18 @@ EXAMPLES := \
 	hipify \
 	matrix_multiplication \
 	occupancy \
+	runtime_compilation \
 	saxpy \
-	streams
+	shared_memory \
+	streams \
+	warp_shuffle
+
+# Only supported on HIP (not CUDA).
+ifneq ($(GPU_RUNTIME), CUDA)
+    EXAMPLES += \
+        assembly_to_executable \
+        llvm_ir_to_executable
+endif
 
 all: $(EXAMPLES)
 
diff --git a/HIP-Basic/assembly_to_executable/.gitignore b/HIP-Basic/assembly_to_executable/.gitignore
new file mode 100644
index 00000000..b5b29f16
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/.gitignore
@@ -0,0 +1,3 @@
+hip_assembly_to_executable
+*.o
+*.hipfb
diff --git a/HIP-Basic/assembly_to_executable/CMakeLists.txt b/HIP-Basic/assembly_to_executable/CMakeLists.txt
new file mode 100644
index 00000000..e93c5f57
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/CMakeLists.txt
@@ -0,0 +1,174 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_assembly_to_executable)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+
+# Only supported on HIP (not CUDA)
+if(NOT "${GPU_RUNTIME}" STREQUAL "HIP")
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+enable_language(${GPU_RUNTIME})
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH)
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+if (NOT DEFINED CMAKE_HIP_ARCHITECTURES)
+    set(GPU_ARCHITECTURES "all" CACHE STRING "GPU architectures to compile for")
+else()
+    set(GPU_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "GPU architectures to compile for")
+endif()
+
+if(GPU_ARCHITECTURES STREQUAL "all")
+    set(GPU_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "GPU architectures to compile for" FORCE)
+endif()
+
+# Remove duplicates.
+list(REMOVE_DUPLICATES GPU_ARCHITECTURES)
+message(STATUS "GPU_ARCHITECTURES: ${GPU_ARCHITECTURES}")
+
+set_source_files_properties(main.hip PROPERTIES COMPILE_OPTIONS "--cuda-host-only")
+
+if (WIN32)
+    set(OBJ_TYPE obj)
+    set(NULDEV NUL)
+    set(HOST_TARGET x86_64-pc-windows-msvc)
+    set(HIP_OBJ_GEN_FILE hip_obj_gen_win.mcin)
+else()
+    set(OBJ_TYPE o)
+    set(NULDEV /dev/null)
+    set(HOST_TARGET x86_64-unknown-linux)
+    set(HIP_OBJ_GEN_FILE hip_obj_gen.mcin)
+endif()
+
+# Assemble the device assemblies to object files using the HIP compiler.
+# The compiler needs -target amdgcn-amd-amdhsa -mcpu=gfx* in order to assemble the object file
+# for the right GPU.
+foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
+    message(STATUS "Generating main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
+    add_custom_command(
+        OUTPUT main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
+        COMMAND ${CMAKE_HIP_COMPILER} -fPIC -target amdgcn-amd-amdhsa -mcpu=${HIP_ARCHITECTURE}
+                        ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.s
+                        -o ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
+        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.s
+        VERBATIM)
+endforeach()
+
+# Locate clang-offload-bundler, which creates an offload bundle from the assembled device object files.
+find_program(
+    OFFLOAD_BUNDLER_COMMAND clang-offload-bundler
+    PATH_SUFFIXES bin
+    PATHS
+    ${ROCM_ROOT}/llvm
+    ${CMAKE_INSTALL_PREFIX}/llvm
+    REQUIRED)
+
+if(OFFLOAD_BUNDLER_COMMAND)
+    message(STATUS "clang-offload-bundler found: ${OFFLOAD_BUNDLER_COMMAND}")
+else()
+    message(FATAL_ERROR "clang-offload-bundler not found")
+endif()
+
+# Generate object bundle.
+# The invocation to generate is
+# clang-offload-bundler -targets=<targets> -input=<input target #1> -input=<input target #2> ... -output=<output>
+# Note that the host target must be the first target present here, and it should have an empty input associated to it.
+
+# BUNDLE_TARGETS is a single comma-separated string: -targets=host-${HOST_TARGET},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE},...
+set(BUNDLE_TARGETS "-targets=host-${HOST_TARGET}")
+# BUNDLE_INPUTS is a list: -input=${NULDEV} for the host, then -input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} per architecture.
+set(BUNDLE_INPUTS "-input=${NULDEV}")
+# BUNDLE_OBJECTS is a list of ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}, used as the dependencies of the bundle.
+set(BUNDLE_OBJECTS "")
+foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
+    set(BUNDLE_TARGETS "${BUNDLE_TARGETS},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE}")
+    list(APPEND BUNDLE_INPUTS "-input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
+    list(APPEND BUNDLE_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
+endforeach()
+
+# Invoke clang-offload-bundler to generate an offload bundle.
+set(BUNDLE "${CMAKE_CURRENT_BINARY_DIR}/offload_bundle.hipfb")
+add_custom_command(
+    OUTPUT "${BUNDLE}"
+    COMMAND
+        "${OFFLOAD_BUNDLER_COMMAND}"
+        -type=o
+        -bundle-align=4096
+        "${BUNDLE_TARGETS}"
+        ${BUNDLE_INPUTS}
+        "-output=${BUNDLE}"
+    DEPENDS ${BUNDLE_OBJECTS}
+    VERBATIM)
+
+# Create the device binary by assembling the template that includes
+# the offload bundle that was just generated using an .incbin directive.
+# This needs an assembler.
+find_program(
+    LLVM_MC_COMMAND llvm-mc
+    PATH_SUFFIXES bin
+    PATHS
+    ${ROCM_ROOT}/llvm
+    ${CMAKE_INSTALL_PREFIX}/llvm)
+
+if(LLVM_MC_COMMAND)
+    message(STATUS "llvm-mc found: ${LLVM_MC_COMMAND}")
+else()
+    message(FATAL_ERROR "llvm-mc not found")
+endif()
+
+# Invoke llvm-mc to generate an object file containing the offload bundle; rebuilt when the bundle or the template changes.
+set(DEVICE_OBJECT "${CMAKE_CURRENT_BINARY_DIR}/main_device.${OBJ_TYPE}")
+add_custom_command(
+    OUTPUT "${DEVICE_OBJECT}"
+    COMMAND
+        "${LLVM_MC_COMMAND}"
+        -triple "${HOST_TARGET}"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}"
+        -o "${DEVICE_OBJECT}"
+        --filetype=obj
+    DEPENDS "${BUNDLE}" "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}"
+    VERBATIM)
+
+# Finally, create the executable.
+add_executable(
+    ${example_name}
+    main.hip
+    ${DEVICE_OBJECT})
+
+# Make example runnable using ctest.
+add_test(${example_name} ${example_name})
+
+set(include_dirs "../../Common")
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
diff --git a/HIP-Basic/assembly_to_executable/Makefile b/HIP-Basic/assembly_to_executable/Makefile
new file mode 100644
index 00000000..3d26a352
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/Makefile
@@ -0,0 +1,89 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+EXAMPLE := hip_assembly_to_executable
+COMMON_INCLUDE_DIR := ../../Common
+GPU_RUNTIME ?= HIP
+
+ifneq ($(GPU_RUNTIME), HIP)
+$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
+endif
+
+
+# HIP variables
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX 				  ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+CLANG                 ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang
+LLVM_MC               ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc
+CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler
+
+# Common variables and flags
+CXX_STD  := c++17
+CXXFLAGS := -std=$(CXX_STD)
+CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+LDFLAGS	 :=
+LDLIBS	 :=
+
+# Compile for these GPU architectures
+HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030
+
+# A literal space cannot be passed as an argument to `subst`, so the `empty`/`space`
+# hack below is used to split HIP_ARCHITECTURES into GPU_ARCHS and to join the
+# entries of GPU_ARCH_TRIPLES with commas. GPU_ARCH_TRIPLES is later passed to
+# CLANG_OFFLOAD_BUNDLER in the -targets option, which must be a single string with no spaces.
+empty =
+space = $(empty) $(empty)
+comma = ,
+
+GPU_ARCHS := $(subst ;,$(space),$(HIP_ARCHITECTURES))
+GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amdhsa--%))
+
+all: $(EXAMPLE)
+
+$(EXAMPLE): main.o main_device.o
+	$(HIPCXX) -o $@ $^
+
+main_device.o: hip_obj_gen.mcin offload_bundle.hipfb
+	$(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj
+
+offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o)
+	$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 \
+		-targets=host-x86_64-unknown-linux,$(GPU_ARCH_TRIPLES) \
+		-input=/dev/null \
+		$(^:%=-input=%) \
+		-output=$@
+
+main.o: main.hip
+	$(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only $<
+
+main_%.o: main_%.s
+	$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $<
+
+clean:
+	rm -f \
+		main_*.o \
+		offload_bundle.hipfb \
+		main_device.o \
+		main.o \
+		$(EXAMPLE)
+# 'all' and 'clean' produce no files of those names; $(EXAMPLE) is a real file and must not be phony.
+.PHONY: all clean
diff --git a/HIP-Basic/assembly_to_executable/README.md b/HIP-Basic/assembly_to_executable/README.md
new file mode 100644
index 00000000..f7307df9
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/README.md
@@ -0,0 +1,115 @@
+# HIP-Basic Assembly to Executable Example
+
+## Description
+This example shows how to manually compile and link a HIP application from device assembly. Pre-generated assembly files are compiled into an _offload bundle_, a bundle of device object files, and then linked with the host object file to produce the final executable.
+
+Building HIP executables from device assembly can be useful, for example, to experiment with specific instructions, to perform specific optimizations, or to help with debugging.
+
+### Building
+
+- Build with Makefile: to compile for specific GPU architectures, optionally provide the `HIP_ARCHITECTURES` variable. Provide the architectures separated by semicolons.
+    ```shell
+    make HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030"
+    ```
+- Build with CMake:
+    ```shell
+    cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030"
+    cmake --build build
+    ```
+    On Windows the path to RC compiler may be needed: `-DCMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/path/to/x64/rc.exe"`
+
+## Generating device assembly
+This example builds a HIP executable from pre-generated device assembly files. Such assembly files can be created from HIP source code using `hipcc` by passing `-S` and `--cuda-device-only`. The former flag instructs the compiler to generate human-readable assembly instead of machine code, and the latter instructs the compiler to only compile the device part of the program. The six assembly files for this example were generated as follows:
+```shell
+$ROCM_INSTALL_DIR/bin/hipcc -S --cuda-device-only --offload-arch=gfx803 --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a --offload-arch=gfx1030 main.hip
+```
+
+The user may modify the `--offload-arch` flag to build for other architectures and choose to either enable or disable extra device code-generation features such as `xnack` or `sram-ecc`, which can be specified as `--offload-arch=<arch>:<feature>+` to enable it or `--offload-arch=<arch>:<feature>-` to disable it. Multiple features may be present, separated by colons.
+
+## Build Process
+A HIP binary consists of a regular host executable, which has an offload bundle containing device code embedded inside it. This offload bundle contains object files for each of the target devices that it is compiled for, and is loaded at runtime to provide the machine code for the current device. A HIP executable can be built from device assembly files and host HIP code according to the following process:
+
+1. The `main.hip` file is compiled to an object file that only contains host code with `hipcc` by using the `--cuda-host-only` option. `main.hip` is a program that launches a simple kernel to compute the square of each element of a vector. The `-c` option is required to prevent the compiler from creating an executable, and make it create an object file containing the compiled host code instead.
+    ```shell
+    $ROCM_INSTALL_DIR/bin/hipcc -c --cuda-host-only main.hip
+    ```
+
+2. Each device assembly file is compiled to a device object file using `clang`. This requires specifying the correct architecture using `-target amdgcn-amd-amdhsa`, and the target architecture that should be assembled for using `-mcpu`:
+
+    ```shell
+    $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx1030 main_gfx1030.s -o main_gfx1030.o
+    $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=<arch> main_<arch>.s -o main_<arch>.o
+    ...
+    ```
+
+3. The device object files are combined into an offload bundle using `clang-offload-bundler`. This requires specifying the target as well as the offload kind for each device, in the form `<offload-kind>-<target>-<arch>`. For HIP device code, `<offload-kind>` is `hipv4`. Note that this command requires an (empty) entry for the host to also be present, with `<offload-kind>` `host`. The order of targets and inputs must match. `<target>` is an LLVM target triple, which is specified as `<isa>-<vendor>-<os>-<abi>`. `<abi>` is left empty for AMD targets.
+
+    ```shell
+    $ROCM_INSTALL_DIR/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 \
+            -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx1030,hipv4-... \
+            -input=/dev/null \
+            -input=main_gfx1030.o -input=... \
+            -output=offload_bundle.hipfb
+    ```
+
+    Note: using -bundle-align=4096 only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same `--offload-arch` as when compiling to assembly.
+
+4. The offload bundle is embedded inside an object file that can be linked with the object file containing the host code. The offload bundle must be placed in the `.hip_fatbin` section, and must be placed after the symbol `__hip_fatbin`. This can be done by creating an assembly file that places the offload bundle in the appropriate section using the `.incbin` directive:
+    ```nasm
+        .type __hip_fatbin,@object
+        ; Tell the assembler to place the offload bundle in the appropriate section.
+        .section .hip_fatbin,"a",@progbits
+        ; Make the symbol that addresses the binary public
+        .globl __hip_fatbin
+        ; Give the bundle the required alignment
+        .p2align 12
+    __hip_fatbin:
+        ; Include the binary
+        .incbin "offload_bundle.hipfb"
+    ```
+    This file can then be assembled using `llvm-mc` as follows:
+    ```
+    $ROCM_INSTALL_DIR/llvm/bin/llvm-mc -triple <host target> -o main_device.o hip_obj_gen.mcin --filetype=obj
+    ```
+
+5. Finally, using the system linker, hipcc, or clang, the host object and device objects are linked into an executable:
+    ```shell
+    $ROCM_INSTALL_DIR/bin/hipcc -o hip_assembly_to_executable main.o main_device.o
+    ```
+
+### Visual Studio 2019
+The above compilation steps are implemented in Visual Studio through Custom Build Steps and Custom Build Tools:
+- The host compilation from step 1 is performed by adding extra options to the source file, under `main.hip -> properties -> C/C++ -> Command Line`:
+    ```
+    Additional Options: --cuda-host-only
+    ```
+- Each device assembly .s file has a custom build tool associated to it, which performs the operation associated to step 2 from the previous section:
+    ```
+    Command Line: "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a
+    Description: Compiling Device Assembly %(Identity)
+    Output: $(IntDir)%(FileName).o
+    Execute Before: ClCompile
+    ```
+- Steps 3 and 4 are implemented using a custom build step:
+    ```
+    Command Line:
+      "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
+      cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj
+    Description: Generating Device Offload Object
+    Outputs: $(IntDir)main_device.obj
+    Additional Dependencies: $(IntDir)main_gfx90a.o;$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)
+    Execute Before: ClCompile
+    ```
+- Finally step 5 is implemented by passing additional inputs to the linker in `project -> properties -> Linker -> Input`:
+    ```
+    Additional Dependencies: $(IntDir)main_device.obj;%(AdditionalDependencies)
+    ```
+
+## Used API surface
+### HIP runtime
+- `hipFree`
+- `hipGetDeviceProperties`
+- `hipGetLastError`
+- `hipLaunchKernelGGL`
+- `hipMalloc`
+- `hipMemcpy`
diff --git a/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln
new file mode 100644
index 00000000..1ceb39e1
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32630.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "assembly_to_executable_vs2019", "assembly_to_executable_vs2019.vcxproj", "{60B4ADE0-8286-46AE-B884-5DA51B541DED}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.ActiveCfg = Debug|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.Build.0 = Debug|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.ActiveCfg = Release|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {5EAD9B5F-41B6-452E-922F-D5782C75EB8F}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj
new file mode 100644
index 00000000..7783b217
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj
@@ -0,0 +1,183 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="hip_obj_gen_win.mcin">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx1030.s">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx803.s">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa  -mcpu=gfx803</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa  -mcpu=gfx803</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx900.s">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx906.s">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx908.s">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx90a.s">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{60b4ade0-8286-46ae-b884-5da51b541ded}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>assembly_to_executable_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+    <CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+    <CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CustomBuild>
+      <Message>Compiling Device Assembly %(Identity)</Message>
+      <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
+      <Outputs>$(IntDir)%(FileName).o</Outputs>
+    </CustomBuild>
+    <CustomBuildStep>
+      <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
+cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Message>Generating Device Offload Object</Message>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Outputs>$(IntDir)main_device.obj</Outputs>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CustomBuild>
+      <Message>Compiling Device Assembly %(Identity)</Message>
+      <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
+      <Outputs>$(IntDir)%(FileName).o</Outputs>
+    </CustomBuild>
+    <CustomBuildStep>
+      <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
+cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Message>Generating Device Offload Object</Message>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Outputs>$(IntDir)main_device.obj</Outputs>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters
new file mode 100644
index 00000000..205bad8d
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/assembly_to_executable_vs2019.vcxproj.filters
@@ -0,0 +1,50 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4f2a1544-a556-4afb-b630-36ba54c0ab4a}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{b93521e0-9944-411a-9f6e-4071af6bc7ea}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{972f07c3-b925-4516-bd65-2d5a3f626888}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup><!-- One entry per custom-built input: the per-architecture assembly files and the llvm-mc input. -->
+    <CustomBuild Include="main_gfx803.s">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx900.s">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx906.s">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx908.s">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx90a.s">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx1030.s">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="hip_obj_gen_win.mcin">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+  </ItemGroup>
+</Project>
diff --git a/HIP-Basic/assembly_to_executable/hip_obj_gen.mcin b/HIP-Basic/assembly_to_executable/hip_obj_gen.mcin
new file mode 100644
index 00000000..6b9fee5f
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/hip_obj_gen.mcin
@@ -0,0 +1,21 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#       HIP Object Generator
+# Assembler input that wraps an offload bundle (fat binary) in a host
+# object file and exposes its first byte as the symbol `__hip_fatbin`.
+#
+# Input:  the offload bundle `offload_bundle.hipfb` (see `.incbin` below)
+# Output: a host object file (.o) that contains the bundle
+
+    .type __hip_fatbin,@object
+    # Emit what follows into the `.hip_fatbin` section.
+    .section .hip_fatbin,"a",@progbits
+    # Make the symbol that addresses the bundle visible to other object files.
+    .globl __hip_fatbin
+    # Align the start of the bundle to 4096 bytes (2 ^ 12).
+    .p2align 12
+__hip_fatbin:
+    # Copy the bytes of the offload bundle in verbatim, right after the label.
+    .incbin "offload_bundle.hipfb"
diff --git a/HIP-Basic/assembly_to_executable/hip_obj_gen_win.mcin b/HIP-Basic/assembly_to_executable/hip_obj_gen_win.mcin
new file mode 100644
index 00000000..3636354e
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/hip_obj_gen_win.mcin
@@ -0,0 +1,20 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#       HIP Object Generator
+# Assembler input that wraps an offload bundle (fat binary) in a host
+# object file and exposes its first byte as the symbol `__hip_fatbin`.
+#
+# Input:  the offload bundle `offload_bundle.hipfb` (see `.incbin` below)
+# Output: a host object file (.obj) that contains the bundle
+
+    # Emit what follows into the `.hip_fatbin` section.
+    .section .hip_fatbin,"dw"
+    # Make the symbol that addresses the bundle visible to other object files.
+    .globl __hip_fatbin
+    # Align the start of the bundle to 4096 bytes (2 ^ 12).
+    .p2align 12
+__hip_fatbin:
+    # Copy the bytes of the offload bundle in verbatim, right after the label.
+    .incbin "offload_bundle.hipfb"
diff --git a/HIP-Basic/assembly_to_executable/main.hip b/HIP-Basic/assembly_to_executable/main.hip
new file mode 100644
index 00000000..588fc070
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main.hip
@@ -0,0 +1,118 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "example_utils.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+/// \brief Kernel that stores the square of `in[i]` in `out[i]`
+/// for every index `i` below `size` that the calling thread visits.
+template<typename T>
+__global__ void vector_square_kernel(T* out, const T* in, const long long size)
+{
+    // Distance between two consecutive elements handled by one thread
+    const size_t step = blockDim.x * gridDim.x;
+    // The unique global thread ID is the first element
+    // this thread squares
+    const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for(size_t idx = first; idx < size; idx += step)
+    {
+        out[idx] = in[idx] * in[idx];
+    }
+}
+
+/// \brief Squares an array of floats with `vector_square_kernel` and checks the result on the host.
+int main()
+{
+    // Set the problem size
+    constexpr size_t size          = 1000000;
+    constexpr size_t size_in_bytes = size * sizeof(float);
+
+    hipDeviceProp_t props;
+    HIP_CHECK(hipGetDeviceProperties(&props, 0 /*deviceID*/));
+    std::cout << "info: running on device " << props.name << "\n";
+
+    std::cout << "info: allocate host mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB) \n";
+
+    // Declare the host side arrays
+    std::vector<float> h_in(size);
+    std::vector<float> h_out(size);
+
+    // Initialize the host side input
+    for(size_t i = 0; i < size; i++)
+    {
+        h_in[i] = 1.618f + i;
+    }
+
+    // Declare the device side arrays; they stay null until hipMalloc fills them in
+    float* d_in  = nullptr;
+    float* d_out = nullptr;
+    std::cout << "info: allocate device mem (" << 2 * size_in_bytes / 1024.0 / 1024.0 << " MiB)\n";
+    // Allocate the device side memory
+    HIP_CHECK(hipMalloc(&d_in, size_in_bytes));
+    HIP_CHECK(hipMalloc(&d_out, size_in_bytes));
+
+    std::cout << "info: copy Host2Device\n";
+    // Copy the input from host to the GPU device
+    HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice));
+
+    // Set the number of blocks per kernel grid.
+    constexpr unsigned int grid_size = 512;
+    // Set the number of threads per kernel block.
+    constexpr unsigned int threads_per_block = 256;
+
+    std::cout << "info: launch 'vector_square_kernel' kernel\n";
+    hipLaunchKernelGGL(vector_square_kernel,
+                       grid_size,
+                       threads_per_block,
+                       0,
+                       hipStreamDefault,
+                       d_out,
+                       d_in,
+                       size);
+
+    // Check that the kernel invocation was successful.
+    HIP_CHECK(hipGetLastError());
+
+    std::cout << "info: copy Device2Host\n";
+    HIP_CHECK(hipMemcpy(h_out.data(), d_out, size_in_bytes, hipMemcpyDeviceToHost));
+
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+
+    std::cout << "info: check result\n";
+    for(size_t i = 0; i < size; i++)
+    {
+        if(h_out[i] != h_in[i] * h_in[i])
+        {
+            std::cerr << "FAILED! h_out[" << i << "] = " << h_out[i]
+                      << ", expected:  " << h_in[i] * h_in[i] << '\n';
+            std::exit(error_exit_code);
+        }
+    }
+    std::cout << "PASSED!\n";
+}
diff --git a/HIP-Basic/assembly_to_executable/main_gfx1030.s b/HIP-Basic/assembly_to_executable/main_gfx1030.s
new file mode 100644
index 00000000..ce4952af
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main_gfx1030.s
@@ -0,0 +1,219 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx1030"
+	.protected	_Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
+	.globl	_Z20vector_square_kernelIfEvPT_PKS0_x
+	.p2align	8
+	.type	_Z20vector_square_kernelIfEvPT_PKS0_x,@function
+_Z20vector_square_kernelIfEvPT_PKS0_x:  ; @_Z20vector_square_kernelIfEvPT_PKS0_x
+; %bb.0: entry. Assumed AMDHSA register setup (confirm): s[4:5] = dispatch packet ptr, s[6:7] = kernarg ptr, s8 = workgroup id x, v0 = work-item id x
+	s_load_dword s0, s[4:5], 0x4  ; s0 = dword at [s[4:5] + 0x4]; its low 16 bits are presumably the workgroup size in x
+	s_load_dwordx2 s[2:3], s[6:7], 0x10  ; s[2:3] = 8-byte kernel argument at offset 16 (`size`)
+	v_mov_b32_e32 v1, 0  ; v1 = 0: high half of the 64-bit element index v[0:1]
+	s_waitcnt lgkmcnt(0)  ; wait for the two scalar loads above
+	s_and_b32 s0, s0, 0xffff  ; keep only the low 16 bits of s0
+	s_mul_i32 s8, s8, s0  ; s8 = s8 * s0 (presumably blockIdx.x * blockDim.x)
+	v_add_nc_u32_e32 v0, s8, v0  ; v0 = s8 + v0: first element index of this lane
+	v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]  ; per lane: size > index (unsigned 64-bit compare)
+	s_and_saveexec_b32 s0, vcc_lo  ; s0 = old exec mask; exec &= vcc_lo
+	s_cbranch_execz BB0_3  ; no lane has an element to process: skip the loop
+; %bb.1: loop setup
+	s_load_dword s8, s[4:5], 0xc  ; s8 = dword at [s[4:5] + 0xc]; used below as the per-iteration index step
+	s_load_dwordx4 s[4:7], s[6:7], 0x0  ; s[4:5] = kernel argument at offset 0 (`out`), s[6:7] = argument at offset 8 (`in`)
+	v_lshlrev_b64 v[2:3], 2, v[0:1]  ; v[2:3] = index << 2: byte offset of a 4-byte element
+	s_mov_b32 s9, 0  ; s9 = 0: high half of the 64-bit step s[8:9]
+	s_mov_b32 s1, s9  ; s1 = 0: mask of lanes that have finished
+	s_waitcnt lgkmcnt(0)  ; wait for the two scalar loads above
+	s_lshl_b64 s[10:11], s[8:9], 2  ; s[10:11] = step << 2: step in bytes
+	.p2align	6
+BB0_2:                                  ; =>This Inner Loop Header: Depth=1
+	v_add_co_u32 v4, vcc_lo, s6, v2  ; v[4:5] = `in` + byte offset (low half, carry out in vcc_lo)
+	v_add_co_ci_u32_e32 v5, vcc_lo, s7, v3, vcc_lo  ; high half plus carry
+	v_add_co_u32 v0, vcc_lo, v0, s8  ; index += step (low half, carry out in vcc_lo)
+	v_add_co_ci_u32_e32 v1, vcc_lo, s9, v1, vcc_lo  ; high half plus carry
+	global_load_dword v6, v[4:5], off  ; v6 = value at address v[4:5] (the input element)
+	v_add_co_u32 v4, vcc_lo, s4, v2  ; v[4:5] = `out` + byte offset (low half, carry out in vcc_lo)
+	v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo  ; high half plus carry
+	v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]  ; per lane: size <= next index, i.e. this is the lane's last iteration
+	v_add_co_u32 v2, s0, v2, s10  ; byte offset += step in bytes (low half, carry out in s0)
+	v_add_co_ci_u32_e64 v3, s0, s11, v3, s0  ; high half plus carry
+	s_or_b32 s1, vcc_lo, s1  ; add the lanes on their last iteration to the finished mask
+	s_waitcnt vmcnt(0)  ; wait until the loaded value has arrived in v6
+	v_mul_f32_e32 v6, v6, v6  ; v6 = v6 * v6
+	global_store_dword v[4:5], v6, off  ; store the square to the output address v[4:5]
+	s_andn2_b32 exec_lo, exec_lo, s1  ; disable the lanes that have finished
+	s_cbranch_execnz BB0_2  ; loop again while any lane is still enabled
+BB0_3:
+	s_endpgm  ; end of the kernel
+	.section	.rodata,#alloc
+	.p2align	6
+	.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
+		.amdhsa_group_segment_fixed_size 0
+		.amdhsa_private_segment_fixed_size 0
+		.amdhsa_kernarg_size 80
+		.amdhsa_user_sgpr_private_segment_buffer 1
+		.amdhsa_user_sgpr_dispatch_ptr 1
+		.amdhsa_user_sgpr_queue_ptr 0
+		.amdhsa_user_sgpr_kernarg_segment_ptr 1
+		.amdhsa_user_sgpr_dispatch_id 0
+		.amdhsa_user_sgpr_flat_scratch_init 0
+		.amdhsa_user_sgpr_private_segment_size 0
+		.amdhsa_wavefront_size32 1
+		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
+		.amdhsa_system_sgpr_workgroup_id_x 1
+		.amdhsa_system_sgpr_workgroup_id_y 0
+		.amdhsa_system_sgpr_workgroup_id_z 0
+		.amdhsa_system_sgpr_workgroup_info 0
+		.amdhsa_system_vgpr_workitem_id 0
+		.amdhsa_next_free_vgpr 7
+		.amdhsa_next_free_sgpr 12
+		.amdhsa_reserve_flat_scratch 0
+		.amdhsa_float_round_mode_32 0
+		.amdhsa_float_round_mode_16_64 0
+		.amdhsa_float_denorm_mode_32 3
+		.amdhsa_float_denorm_mode_16_64 3
+		.amdhsa_dx10_clamp 1
+		.amdhsa_ieee_mode 1
+		.amdhsa_fp16_overflow 0
+		.amdhsa_workgroup_processor_mode 1
+		.amdhsa_memory_ordered 1
+		.amdhsa_forward_progress 0
+		.amdhsa_exception_fp_ieee_invalid_op 0
+		.amdhsa_exception_fp_denorm_src 0
+		.amdhsa_exception_fp_ieee_div_zero 0
+		.amdhsa_exception_fp_ieee_overflow 0
+		.amdhsa_exception_fp_ieee_underflow 0
+		.amdhsa_exception_fp_ieee_inexact 0
+		.amdhsa_exception_int_div_zero 0
+	.end_amdhsa_kernel
+	.text
+.Lfunc_end0:
+	.size	_Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
+                                        ; -- End function
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 188
+; NumSgprs: 14
+; NumVgprs: 7
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 0
+; NumSGPRsForWavesPerEU: 14
+; NumVGPRsForWavesPerEU: 7
+; Occupancy: 16
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+	.text
+	.p2alignl 6, 3214868480
+	.fill 48, 4, 3214868480
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.type	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
+
+	.ident	"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
+	.section	".note.GNU-stack"
+	.addrsig
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.amdgpu_metadata
+---
+amdhsa.kernels:
+  - .args:
+      - .address_space:  global
+        .offset:         0
+        .size:           8
+        .value_kind:     global_buffer
+      - .address_space:  global
+        .offset:         8
+        .size:           8
+        .value_kind:     global_buffer
+      - .offset:         16
+        .size:           8
+        .value_kind:     by_value
+      - .offset:         24
+        .size:           8
+        .value_kind:     hidden_global_offset_x
+      - .offset:         32
+        .size:           8
+        .value_kind:     hidden_global_offset_y
+      - .offset:         40
+        .size:           8
+        .value_kind:     hidden_global_offset_z
+      - .address_space:  global
+        .offset:         48
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         56
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         64
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         72
+        .size:           8
+        .value_kind:     hidden_multigrid_sync_arg
+    .group_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 80
+    .language:       OpenCL C
+    .language_version:
+      - 2
+      - 0
+    .max_flat_workgroup_size: 1024
+    .name:           _Z20vector_square_kernelIfEvPT_PKS0_x
+    .private_segment_fixed_size: 0
+    .sgpr_count:     14
+    .sgpr_spill_count: 0
+    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_x.kd
+    .vgpr_count:     7
+    .vgpr_spill_count: 0
+    .wavefront_size: 32
+amdhsa.target:   amdgcn-amd-amdhsa--gfx1030
+amdhsa.version:
+  - 1
+  - 1
+...
+
+	.end_amdgpu_metadata
diff --git a/HIP-Basic/assembly_to_executable/main_gfx803.s b/HIP-Basic/assembly_to_executable/main_gfx803.s
new file mode 100644
index 00000000..7f9c7f3f
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main_gfx803.s
@@ -0,0 +1,214 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx803"
+	.protected	_Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
+	.globl	_Z20vector_square_kernelIfEvPT_PKS0_x
+	.p2align	8
+	.type	_Z20vector_square_kernelIfEvPT_PKS0_x,@function
+_Z20vector_square_kernelIfEvPT_PKS0_x:  ; @_Z20vector_square_kernelIfEvPT_PKS0_x
+; %bb.0: entry. Assumed AMDHSA register setup (confirm): s[4:5] = dispatch packet ptr, s[6:7] = kernarg ptr, s8 = workgroup id x, v0 = work-item id x
+	s_load_dword s0, s[4:5], 0x4  ; s0 = dword at [s[4:5] + 0x4]; its low 16 bits are presumably the workgroup size in x
+	s_load_dwordx2 s[10:11], s[6:7], 0x10  ; s[10:11] = 8-byte kernel argument at offset 16 (`size`)
+	v_mov_b32_e32 v1, 0  ; v1 = 0: high half of the 64-bit element index v[0:1]
+	s_waitcnt lgkmcnt(0)  ; wait for the two scalar loads above
+	s_and_b32 s0, s0, 0xffff  ; keep only the low 16 bits of s0
+	s_mul_i32 s8, s8, s0  ; s8 = s8 * s0 (presumably blockIdx.x * blockDim.x)
+	v_add_u32_e32 v0, vcc, s8, v0  ; v0 = s8 + v0: first element index of this lane
+	v_cmp_gt_u64_e32 vcc, s[10:11], v[0:1]  ; per lane: size > index (unsigned 64-bit compare)
+	s_and_saveexec_b64 s[0:1], vcc  ; s[0:1] = old exec mask; exec &= vcc
+	s_cbranch_execz BB0_3  ; no lane has an element to process: skip the loop
+; %bb.1: loop setup
+	s_load_dword s8, s[4:5], 0xc  ; s8 = dword at [s[4:5] + 0xc]; used below as the per-iteration index step
+	s_load_dwordx4 s[4:7], s[6:7], 0x0  ; s[4:5] = kernel argument at offset 0 (`out`), s[6:7] = argument at offset 8 (`in`)
+	s_mov_b32 s9, 0  ; s9 = 0: high half of the 64-bit step s[8:9]
+	v_lshlrev_b64 v[2:3], 2, v[0:1]  ; v[2:3] = index << 2: byte offset of a 4-byte element
+	s_mov_b64 s[14:15], 0  ; s[14:15] = 0: mask of lanes that have finished
+	s_waitcnt lgkmcnt(0)  ; wait for the two scalar loads above
+	s_lshl_b64 s[12:13], s[8:9], 2  ; s[12:13] = step << 2: step in bytes
+BB0_2:                                  ; =>This Inner Loop Header: Depth=1
+	v_mov_b32_e32 v5, s7  ; v5 = high half of `in`
+	v_add_u32_e32 v4, vcc, s6, v2  ; v4 = low half of `in` + low half of byte offset (carry out in vcc)
+	v_addc_u32_e32 v5, vcc, v5, v3, vcc  ; v5 = high halves plus carry: v[4:5] = input address
+	flat_load_dword v6, v[4:5]  ; v6 = value at address v[4:5] (the input element)
+	v_mov_b32_e32 v5, s5  ; v5 = high half of `out`
+	v_mov_b32_e32 v7, s9  ; v7 = high half of the step (0)
+	v_add_u32_e32 v0, vcc, s8, v0  ; index += step (low half, carry out in vcc)
+	v_mov_b32_e32 v8, s13  ; v8 = high half of the step in bytes
+	v_add_u32_e64 v4, s[0:1], s4, v2  ; v4 = low half of `out` + low half of byte offset (carry out in s[0:1])
+	v_add_u32_e64 v2, s[2:3], s12, v2  ; byte offset += step in bytes (low half, carry out in s[2:3])
+	v_addc_u32_e64 v5, s[0:1], v5, v3, s[0:1]  ; v5 = high halves plus carry: v[4:5] = output address
+	v_addc_u32_e32 v1, vcc, v1, v7, vcc  ; high half of index += high half of step plus carry
+	v_addc_u32_e64 v3, vcc, v3, v8, s[2:3]  ; high half of byte offset += high half of step in bytes plus carry
+	v_cmp_le_u64_e32 vcc, s[10:11], v[0:1]  ; per lane: size <= next index, i.e. this is the lane's last iteration
+	s_or_b64 s[14:15], vcc, s[14:15]  ; add the lanes on their last iteration to the finished mask
+	s_waitcnt vmcnt(0)  ; wait until the loaded value has arrived in v6
+	v_mul_f32_e32 v6, v6, v6  ; v6 = v6 * v6
+	flat_store_dword v[4:5], v6  ; store the square to the output address v[4:5]
+	s_andn2_b64 exec, exec, s[14:15]  ; disable the lanes that have finished
+	s_cbranch_execnz BB0_2  ; loop again while any lane is still enabled
+BB0_3:
+	s_endpgm  ; end of the kernel
+	.section	.rodata,#alloc
+	.p2align	6
+	.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
+		.amdhsa_group_segment_fixed_size 0
+		.amdhsa_private_segment_fixed_size 0
+		.amdhsa_kernarg_size 80
+		.amdhsa_user_sgpr_private_segment_buffer 1
+		.amdhsa_user_sgpr_dispatch_ptr 1
+		.amdhsa_user_sgpr_queue_ptr 0
+		.amdhsa_user_sgpr_kernarg_segment_ptr 1
+		.amdhsa_user_sgpr_dispatch_id 0
+		.amdhsa_user_sgpr_flat_scratch_init 0
+		.amdhsa_user_sgpr_private_segment_size 0
+		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
+		.amdhsa_system_sgpr_workgroup_id_x 1
+		.amdhsa_system_sgpr_workgroup_id_y 0
+		.amdhsa_system_sgpr_workgroup_id_z 0
+		.amdhsa_system_sgpr_workgroup_info 0
+		.amdhsa_system_vgpr_workitem_id 0
+		.amdhsa_next_free_vgpr 9
+		.amdhsa_next_free_sgpr 16
+		.amdhsa_reserve_flat_scratch 0
+		.amdhsa_float_round_mode_32 0
+		.amdhsa_float_round_mode_16_64 0
+		.amdhsa_float_denorm_mode_32 0
+		.amdhsa_float_denorm_mode_16_64 3
+		.amdhsa_dx10_clamp 1
+		.amdhsa_ieee_mode 1
+		.amdhsa_exception_fp_ieee_invalid_op 0
+		.amdhsa_exception_fp_denorm_src 0
+		.amdhsa_exception_fp_ieee_div_zero 0
+		.amdhsa_exception_fp_ieee_overflow 0
+		.amdhsa_exception_fp_ieee_underflow 0
+		.amdhsa_exception_fp_ieee_inexact 0
+		.amdhsa_exception_int_div_zero 0
+	.end_amdhsa_kernel
+	.text
+.Lfunc_end0:
+	.size	_Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
+                                        ; -- End function
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 200
+; NumSgprs: 18
+; NumVgprs: 9
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 192
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 2
+; NumSGPRsForWavesPerEU: 18
+; NumVGPRsForWavesPerEU: 9
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.type	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
+
+	.ident	"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
+	.section	".note.GNU-stack"
+	.addrsig
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.amdgpu_metadata
+---
+amdhsa.kernels:
+  - .args:
+      - .address_space:  global
+        .offset:         0
+        .size:           8
+        .value_kind:     global_buffer
+      - .address_space:  global
+        .offset:         8
+        .size:           8
+        .value_kind:     global_buffer
+      - .offset:         16
+        .size:           8
+        .value_kind:     by_value
+      - .offset:         24
+        .size:           8
+        .value_kind:     hidden_global_offset_x
+      - .offset:         32
+        .size:           8
+        .value_kind:     hidden_global_offset_y
+      - .offset:         40
+        .size:           8
+        .value_kind:     hidden_global_offset_z
+      - .address_space:  global
+        .offset:         48
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         56
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         64
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         72
+        .size:           8
+        .value_kind:     hidden_multigrid_sync_arg
+    .group_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 80
+    .language:       OpenCL C
+    .language_version:
+      - 2
+      - 0
+    .max_flat_workgroup_size: 1024
+    .name:           _Z20vector_square_kernelIfEvPT_PKS0_x
+    .private_segment_fixed_size: 0
+    .sgpr_count:     18
+    .sgpr_spill_count: 0
+    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_x.kd
+    .vgpr_count:     9
+    .vgpr_spill_count: 0
+    .wavefront_size: 64
+amdhsa.target:   amdgcn-amd-amdhsa--gfx803
+amdhsa.version:
+  - 1
+  - 1
+...
+
+	.end_amdgpu_metadata
diff --git a/HIP-Basic/assembly_to_executable/main_gfx900.s b/HIP-Basic/assembly_to_executable/main_gfx900.s
new file mode 100644
index 00000000..6ca519c1
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main_gfx900.s
@@ -0,0 +1,216 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx900"
+	.protected	_Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
+	.globl	_Z20vector_square_kernelIfEvPT_PKS0_x
+	.p2align	8
+	.type	_Z20vector_square_kernelIfEvPT_PKS0_x,@function
+_Z20vector_square_kernelIfEvPT_PKS0_x:  ; @_Z20vector_square_kernelIfEvPT_PKS0_x
+; %bb.0: entry. Assumed AMDHSA register setup (confirm): s[4:5] = dispatch packet ptr, s[6:7] = kernarg ptr, s8 = workgroup id x, v0 = work-item id x
+	s_load_dword s0, s[4:5], 0x4  ; s0 = dword at [s[4:5] + 0x4]; its low 16 bits are presumably the workgroup size in x
+	s_load_dwordx2 s[12:13], s[6:7], 0x10  ; s[12:13] = 8-byte kernel argument at offset 16 (`size`)
+	v_mov_b32_e32 v1, 0  ; v1 = 0: high half of the 64-bit element index v[0:1]
+	s_waitcnt lgkmcnt(0)  ; wait for the two scalar loads above
+	s_and_b32 s0, s0, 0xffff  ; keep only the low 16 bits of s0
+	s_mul_i32 s8, s8, s0  ; s8 = s8 * s0 (presumably blockIdx.x * blockDim.x)
+	v_add_u32_e32 v0, s8, v0  ; v0 = s8 + v0: first element index of this lane
+	v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]  ; per lane: size > index (unsigned 64-bit compare)
+	s_and_saveexec_b64 s[0:1], vcc  ; s[0:1] = old exec mask; exec &= vcc
+	s_cbranch_execz BB0_3  ; no lane has an element to process: skip the loop
+; %bb.1: loop setup
+	s_load_dword s14, s[4:5], 0xc  ; s14 = dword at [s[4:5] + 0xc]; used below as the per-iteration index step
+	s_load_dwordx4 s[8:11], s[6:7], 0x0  ; s[8:9] = kernel argument at offset 0 (`out`), s[10:11] = argument at offset 8 (`in`)
+	s_mov_b32 s15, 0  ; s15 = 0: high half of the 64-bit step s[14:15]
+	v_lshlrev_b64 v[2:3], 2, v[0:1]  ; v[2:3] = index << 2: byte offset of a 4-byte element
+	s_mov_b64 s[6:7], 0  ; s[6:7] = 0: mask of lanes that have finished
+	s_waitcnt lgkmcnt(0)  ; wait for the two scalar loads above
+	s_lshl_b64 s[4:5], s[14:15], 2  ; s[4:5] = step << 2: step in bytes
+BB0_2:                                  ; =>This Inner Loop Header: Depth=1
+	v_mov_b32_e32 v5, s11  ; v5 = high half of `in`
+	v_add_co_u32_e32 v4, vcc, s10, v2  ; v4 = low half of `in` + low half of byte offset (carry out in vcc)
+	v_addc_co_u32_e32 v5, vcc, v5, v3, vcc  ; v5 = high halves plus carry: v[4:5] = input address
+	global_load_dword v6, v[4:5], off  ; v6 = value at address v[4:5] (the input element)
+	v_mov_b32_e32 v5, s9  ; v5 = high half of `out`
+	v_mov_b32_e32 v7, s15  ; v7 = high half of the step (0)
+	v_add_co_u32_e32 v0, vcc, s14, v0  ; index += step (low half, carry out in vcc)
+	v_mov_b32_e32 v8, s5  ; v8 = high half of the step in bytes
+	v_add_co_u32_e64 v4, s[0:1], s8, v2  ; v4 = low half of `out` + low half of byte offset (carry out in s[0:1])
+	v_add_co_u32_e64 v2, s[2:3], s4, v2  ; byte offset += step in bytes (low half, carry out in s[2:3])
+	v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1]  ; v5 = high halves plus carry: v[4:5] = output address
+	v_addc_co_u32_e32 v1, vcc, v1, v7, vcc  ; high half of index += high half of step plus carry
+	v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3]  ; high half of byte offset += high half of step in bytes plus carry
+	v_cmp_le_u64_e32 vcc, s[12:13], v[0:1]  ; per lane: size <= next index, i.e. this is the lane's last iteration
+	s_or_b64 s[6:7], vcc, s[6:7]  ; add the lanes on their last iteration to the finished mask
+	s_waitcnt vmcnt(0)  ; wait until the loaded value has arrived in v6
+	v_mul_f32_e32 v6, v6, v6  ; v6 = v6 * v6
+	global_store_dword v[4:5], v6, off  ; store the square to the output address v[4:5]
+	s_andn2_b64 exec, exec, s[6:7]  ; disable the lanes that have finished
+	s_cbranch_execnz BB0_2  ; loop again while any lane is still enabled
+BB0_3:
+	s_endpgm  ; end of the kernel
+	.section	.rodata,#alloc
+	.p2align	6
+	.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
+		.amdhsa_group_segment_fixed_size 0
+		.amdhsa_private_segment_fixed_size 0
+		.amdhsa_kernarg_size 80
+		.amdhsa_user_sgpr_private_segment_buffer 1
+		.amdhsa_user_sgpr_dispatch_ptr 1
+		.amdhsa_user_sgpr_queue_ptr 0
+		.amdhsa_user_sgpr_kernarg_segment_ptr 1
+		.amdhsa_user_sgpr_dispatch_id 0
+		.amdhsa_user_sgpr_flat_scratch_init 0
+		.amdhsa_user_sgpr_private_segment_size 0
+		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
+		.amdhsa_system_sgpr_workgroup_id_x 1
+		.amdhsa_system_sgpr_workgroup_id_y 0
+		.amdhsa_system_sgpr_workgroup_id_z 0
+		.amdhsa_system_sgpr_workgroup_info 0
+		.amdhsa_system_vgpr_workitem_id 0
+		.amdhsa_next_free_vgpr 9
+		.amdhsa_next_free_sgpr 16
+		.amdhsa_reserve_flat_scratch 0
+		.amdhsa_reserve_xnack_mask 1
+		.amdhsa_float_round_mode_32 0
+		.amdhsa_float_round_mode_16_64 0
+		.amdhsa_float_denorm_mode_32 3
+		.amdhsa_float_denorm_mode_16_64 3
+		.amdhsa_dx10_clamp 1
+		.amdhsa_ieee_mode 1
+		.amdhsa_fp16_overflow 0
+		.amdhsa_exception_fp_ieee_invalid_op 0
+		.amdhsa_exception_fp_denorm_src 0
+		.amdhsa_exception_fp_ieee_div_zero 0
+		.amdhsa_exception_fp_ieee_overflow 0
+		.amdhsa_exception_fp_ieee_underflow 0
+		.amdhsa_exception_fp_ieee_inexact 0
+		.amdhsa_exception_int_div_zero 0
+	.end_amdhsa_kernel
+	.text
+.Lfunc_end0:
+	.size	_Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
+                                        ; -- End function
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 200
+; NumSgprs: 18
+; NumVgprs: 9
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 2
+; NumSGPRsForWavesPerEU: 18
+; NumVGPRsForWavesPerEU: 9
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.type	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
+
+	.ident	"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
+	.section	".note.GNU-stack"
+	.addrsig
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.amdgpu_metadata
+---
+amdhsa.kernels:
+  - .args:
+      - .address_space:  global
+        .offset:         0
+        .size:           8
+        .value_kind:     global_buffer
+      - .address_space:  global
+        .offset:         8
+        .size:           8
+        .value_kind:     global_buffer
+      - .offset:         16
+        .size:           8
+        .value_kind:     by_value
+      - .offset:         24
+        .size:           8
+        .value_kind:     hidden_global_offset_x
+      - .offset:         32
+        .size:           8
+        .value_kind:     hidden_global_offset_y
+      - .offset:         40
+        .size:           8
+        .value_kind:     hidden_global_offset_z
+      - .address_space:  global
+        .offset:         48
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         56
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         64
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         72
+        .size:           8
+        .value_kind:     hidden_multigrid_sync_arg
+    .group_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 80
+    .language:       OpenCL C
+    .language_version:
+      - 2
+      - 0
+    .max_flat_workgroup_size: 1024
+    .name:           _Z20vector_square_kernelIfEvPT_PKS0_x
+    .private_segment_fixed_size: 0
+    .sgpr_count:     18
+    .sgpr_spill_count: 0
+    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_x.kd
+    .vgpr_count:     9
+    .vgpr_spill_count: 0
+    .wavefront_size: 64
+amdhsa.target:   amdgcn-amd-amdhsa--gfx900
+amdhsa.version:
+  - 1
+  - 1
+...
+
+	.end_amdgpu_metadata
diff --git a/HIP-Basic/assembly_to_executable/main_gfx906.s b/HIP-Basic/assembly_to_executable/main_gfx906.s
new file mode 100644
index 00000000..2447c87b
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main_gfx906.s
@@ -0,0 +1,216 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx906"
+	.protected	_Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
+	.globl	_Z20vector_square_kernelIfEvPT_PKS0_x
+	.p2align	8
+	.type	_Z20vector_square_kernelIfEvPT_PKS0_x,@function
+_Z20vector_square_kernelIfEvPT_PKS0_x:  ; @_Z20vector_square_kernelIfEvPT_PKS0_x
+; %bb.0: entry - compute this lane's starting element index and mask off lanes whose index is not below the by-value count
+	s_load_dword s0, s[4:5], 0x4 ; dword at byte offset 4 from s[4:5] (presumably the dispatch packet; low 16 bits look like workgroup_size_x - confirm against the HSA AQL layout)
+	s_load_dwordx2 s[12:13], s[6:7], 0x10 ; 64-bit value at byte offset 16 from s[6:7] (presumably the kernarg segment; offset 16 is the by_value argument in the metadata)
+	v_mov_b32_e32 v1, 0 ; upper half of the 64-bit index in v[0:1] starts at zero
+	s_waitcnt lgkmcnt(0) ; wait for the scalar loads above
+	s_and_b32 s0, s0, 0xffff ; keep only the low 16 bits of the loaded dword
+	s_mul_i32 s8, s8, s0 ; s8 *= s0 (s8 is presumably workgroup_id_x, the only system SGPR enabled in the descriptor)
+	v_add_u32_e32 v0, s8, v0 ; v0 = s8 + v0 (v0 is presumably the work-item id x on entry)
+	v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; vcc = count > index (unsigned 64-bit), per lane
+	s_and_saveexec_b64 s[0:1], vcc ; save exec in s[0:1], then exec &= vcc
+	s_cbranch_execz BB0_3 ; no lane has work: go straight to the end
+; %bb.1: loop setup - load the stride and both buffer pointers, convert index and stride to byte offsets
+	s_load_dword s14, s[4:5], 0xc ; dword at byte offset 12 from s[4:5]; used below as the per-iteration index stride (presumably grid_size_x of the dispatch packet - confirm)
+	s_load_dwordx4 s[8:11], s[6:7], 0x0 ; first 16 bytes from s[6:7]: s[8:9] = argument at offset 0, s[10:11] = argument at offset 8
+	s_mov_b32 s15, 0 ; zero-extend the stride to 64 bits in s[14:15]
+	v_lshlrev_b64 v[2:3], 2, v[0:1] ; v[2:3] = index << 2, i.e. byte offset for 4-byte elements
+	s_mov_b64 s[6:7], 0 ; s[6:7] accumulates the mask of finished lanes; start empty
+	s_waitcnt lgkmcnt(0) ; wait for the scalar loads above
+	s_lshl_b64 s[4:5], s[14:15], 2 ; s[4:5] = stride << 2, the stride in bytes
+BB0_2:                                  ; =>This Inner Loop Header: Depth=1
+	v_mov_b32_e32 v5, s11 ; high half of the source pointer s[10:11]
+	v_add_co_u32_e32 v4, vcc, s10, v2 ; low half: source pointer + byte offset, carry out in vcc
+	v_addc_co_u32_e32 v5, vcc, v5, v3, vcc ; high half with carry: v[4:5] = s[10:11] + v[2:3]
+	global_load_dword v6, v[4:5], off ; v6 = 32-bit element at the source address
+	v_mov_b32_e32 v5, s9 ; high half of the destination pointer s[8:9]
+	v_mov_b32_e32 v7, s15 ; high half of the index stride (zero)
+	v_add_co_u32_e32 v0, vcc, s14, v0 ; index low half += stride, carry out in vcc
+	v_mov_b32_e32 v8, s5 ; high half of the byte stride
+	v_add_co_u32_e64 v4, s[0:1], s8, v2 ; low half: destination pointer + current byte offset, carry out in s[0:1]
+	v_add_co_u32_e64 v2, s[2:3], s4, v2 ; byte offset low half += byte stride (for the next iteration), carry out in s[2:3]
+	v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] ; high half with carry: v[4:5] = s[8:9] + current byte offset
+	v_addc_co_u32_e32 v1, vcc, v1, v7, vcc ; index high half += carry; v[0:1] is now the next index
+	v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] ; byte offset high half += byte stride high half + carry
+	v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] ; vcc = count <= next index, i.e. lanes that have no further element
+	s_or_b64 s[6:7], vcc, s[6:7] ; add those lanes to the finished mask
+	s_waitcnt vmcnt(0) ; wait for the global load into v6
+	v_mul_f32_e32 v6, v6, v6 ; square the element as a 32-bit float
+	global_store_dword v[4:5], v6, off ; store the result at the destination address
+	s_andn2_b64 exec, exec, s[6:7] ; deactivate finished lanes
+	s_cbranch_execnz BB0_2 ; loop while any lane is still active
+BB0_3:
+	s_endpgm ; end of the program for this wave
+	.section	.rodata,#alloc
+	.p2align	6
+	.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
+		.amdhsa_group_segment_fixed_size 0
+		.amdhsa_private_segment_fixed_size 0
+		.amdhsa_kernarg_size 80
+		.amdhsa_user_sgpr_private_segment_buffer 1
+		.amdhsa_user_sgpr_dispatch_ptr 1
+		.amdhsa_user_sgpr_queue_ptr 0
+		.amdhsa_user_sgpr_kernarg_segment_ptr 1
+		.amdhsa_user_sgpr_dispatch_id 0
+		.amdhsa_user_sgpr_flat_scratch_init 0
+		.amdhsa_user_sgpr_private_segment_size 0
+		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
+		.amdhsa_system_sgpr_workgroup_id_x 1
+		.amdhsa_system_sgpr_workgroup_id_y 0
+		.amdhsa_system_sgpr_workgroup_id_z 0
+		.amdhsa_system_sgpr_workgroup_info 0
+		.amdhsa_system_vgpr_workitem_id 0
+		.amdhsa_next_free_vgpr 9
+		.amdhsa_next_free_sgpr 16
+		.amdhsa_reserve_flat_scratch 0
+		.amdhsa_reserve_xnack_mask 1
+		.amdhsa_float_round_mode_32 0
+		.amdhsa_float_round_mode_16_64 0
+		.amdhsa_float_denorm_mode_32 3
+		.amdhsa_float_denorm_mode_16_64 3
+		.amdhsa_dx10_clamp 1
+		.amdhsa_ieee_mode 1
+		.amdhsa_fp16_overflow 0
+		.amdhsa_exception_fp_ieee_invalid_op 0
+		.amdhsa_exception_fp_denorm_src 0
+		.amdhsa_exception_fp_ieee_div_zero 0
+		.amdhsa_exception_fp_ieee_overflow 0
+		.amdhsa_exception_fp_ieee_underflow 0
+		.amdhsa_exception_fp_ieee_inexact 0
+		.amdhsa_exception_int_div_zero 0
+	.end_amdhsa_kernel
+	.text
+.Lfunc_end0:
+	.size	_Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
+                                        ; -- End function
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 200
+; NumSgprs: 18
+; NumVgprs: 9
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 2
+; NumSGPRsForWavesPerEU: 18
+; NumVGPRsForWavesPerEU: 9
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.type	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
+
+	.ident	"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
+	.section	".note.GNU-stack"
+	.addrsig
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.amdgpu_metadata
+---
+amdhsa.kernels:
+  - .args:
+      - .address_space:  global
+        .offset:         0
+        .size:           8
+        .value_kind:     global_buffer
+      - .address_space:  global
+        .offset:         8
+        .size:           8
+        .value_kind:     global_buffer
+      - .offset:         16
+        .size:           8
+        .value_kind:     by_value
+      - .offset:         24
+        .size:           8
+        .value_kind:     hidden_global_offset_x
+      - .offset:         32
+        .size:           8
+        .value_kind:     hidden_global_offset_y
+      - .offset:         40
+        .size:           8
+        .value_kind:     hidden_global_offset_z
+      - .address_space:  global
+        .offset:         48
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         56
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         64
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         72
+        .size:           8
+        .value_kind:     hidden_multigrid_sync_arg
+    .group_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 80
+    .language:       OpenCL C
+    .language_version:
+      - 2
+      - 0
+    .max_flat_workgroup_size: 1024
+    .name:           _Z20vector_square_kernelIfEvPT_PKS0_x
+    .private_segment_fixed_size: 0
+    .sgpr_count:     18
+    .sgpr_spill_count: 0
+    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_x.kd
+    .vgpr_count:     9
+    .vgpr_spill_count: 0
+    .wavefront_size: 64
+amdhsa.target:   amdgcn-amd-amdhsa--gfx906
+amdhsa.version:
+  - 1
+  - 1
+...
+
+	.end_amdgpu_metadata
diff --git a/HIP-Basic/assembly_to_executable/main_gfx908.s b/HIP-Basic/assembly_to_executable/main_gfx908.s
new file mode 100644
index 00000000..851f0a89
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main_gfx908.s
@@ -0,0 +1,218 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx908"
+	.protected	_Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
+	.globl	_Z20vector_square_kernelIfEvPT_PKS0_x
+	.p2align	8
+	.type	_Z20vector_square_kernelIfEvPT_PKS0_x,@function
+_Z20vector_square_kernelIfEvPT_PKS0_x:  ; @_Z20vector_square_kernelIfEvPT_PKS0_x
+; %bb.0: entry - compute this lane's starting element index and mask off lanes whose index is not below the by-value count
+	s_load_dword s0, s[4:5], 0x4 ; dword at byte offset 4 from s[4:5] (presumably the dispatch packet; low 16 bits look like workgroup_size_x - confirm against the HSA AQL layout)
+	s_load_dwordx2 s[12:13], s[6:7], 0x10 ; 64-bit value at byte offset 16 from s[6:7] (presumably the kernarg segment; offset 16 is the by_value argument in the metadata)
+	v_mov_b32_e32 v1, 0 ; upper half of the 64-bit index in v[0:1] starts at zero
+	s_waitcnt lgkmcnt(0) ; wait for the scalar loads above
+	s_and_b32 s0, s0, 0xffff ; keep only the low 16 bits of the loaded dword
+	s_mul_i32 s8, s8, s0 ; s8 *= s0 (s8 is presumably workgroup_id_x, the only system SGPR enabled in the descriptor)
+	v_add_u32_e32 v0, s8, v0 ; v0 = s8 + v0 (v0 is presumably the work-item id x on entry)
+	v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; vcc = count > index (unsigned 64-bit), per lane
+	s_and_saveexec_b64 s[0:1], vcc ; save exec in s[0:1], then exec &= vcc
+	s_cbranch_execz BB0_3 ; no lane has work: go straight to the end
+; %bb.1: loop setup - load the stride and both buffer pointers, convert index and stride to byte offsets
+	s_load_dword s14, s[4:5], 0xc ; dword at byte offset 12 from s[4:5]; used below as the per-iteration index stride (presumably grid_size_x of the dispatch packet - confirm)
+	s_load_dwordx4 s[8:11], s[6:7], 0x0 ; first 16 bytes from s[6:7]: s[8:9] = argument at offset 0, s[10:11] = argument at offset 8
+	s_mov_b32 s15, 0 ; zero-extend the stride to 64 bits in s[14:15]
+	v_lshlrev_b64 v[2:3], 2, v[0:1] ; v[2:3] = index << 2, i.e. byte offset for 4-byte elements
+	s_mov_b64 s[6:7], 0 ; s[6:7] accumulates the mask of finished lanes; start empty
+	s_waitcnt lgkmcnt(0) ; wait for the scalar loads above
+	s_lshl_b64 s[4:5], s[14:15], 2 ; s[4:5] = stride << 2, the stride in bytes
+BB0_2:                                  ; =>This Inner Loop Header: Depth=1
+	v_mov_b32_e32 v5, s11 ; high half of the source pointer s[10:11]
+	v_add_co_u32_e32 v4, vcc, s10, v2 ; low half: source pointer + byte offset, carry out in vcc
+	v_addc_co_u32_e32 v5, vcc, v5, v3, vcc ; high half with carry: v[4:5] = s[10:11] + v[2:3]
+	global_load_dword v6, v[4:5], off ; v6 = 32-bit element at the source address
+	v_mov_b32_e32 v5, s9 ; high half of the destination pointer s[8:9]
+	v_mov_b32_e32 v7, s15 ; high half of the index stride (zero)
+	v_add_co_u32_e32 v0, vcc, s14, v0 ; index low half += stride, carry out in vcc
+	v_mov_b32_e32 v8, s5 ; high half of the byte stride
+	v_add_co_u32_e64 v4, s[0:1], s8, v2 ; low half: destination pointer + current byte offset, carry out in s[0:1]
+	v_add_co_u32_e64 v2, s[2:3], s4, v2 ; byte offset low half += byte stride (for the next iteration), carry out in s[2:3]
+	v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] ; high half with carry: v[4:5] = s[8:9] + current byte offset
+	v_addc_co_u32_e32 v1, vcc, v1, v7, vcc ; index high half += carry; v[0:1] is now the next index
+	v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] ; byte offset high half += byte stride high half + carry
+	v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] ; vcc = count <= next index, i.e. lanes that have no further element
+	s_or_b64 s[6:7], vcc, s[6:7] ; add those lanes to the finished mask
+	s_waitcnt vmcnt(0) ; wait for the global load into v6
+	v_mul_f32_e32 v6, v6, v6 ; square the element as a 32-bit float
+	global_store_dword v[4:5], v6, off ; store the result at the destination address
+	s_andn2_b64 exec, exec, s[6:7] ; deactivate finished lanes
+	s_cbranch_execnz BB0_2 ; loop while any lane is still active
+BB0_3:
+	s_endpgm ; end of the program for this wave
+	.section	.rodata,#alloc
+	.p2align	6
+	.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
+		.amdhsa_group_segment_fixed_size 0
+		.amdhsa_private_segment_fixed_size 0
+		.amdhsa_kernarg_size 80
+		.amdhsa_user_sgpr_private_segment_buffer 1
+		.amdhsa_user_sgpr_dispatch_ptr 1
+		.amdhsa_user_sgpr_queue_ptr 0
+		.amdhsa_user_sgpr_kernarg_segment_ptr 1
+		.amdhsa_user_sgpr_dispatch_id 0
+		.amdhsa_user_sgpr_flat_scratch_init 0
+		.amdhsa_user_sgpr_private_segment_size 0
+		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
+		.amdhsa_system_sgpr_workgroup_id_x 1
+		.amdhsa_system_sgpr_workgroup_id_y 0
+		.amdhsa_system_sgpr_workgroup_id_z 0
+		.amdhsa_system_sgpr_workgroup_info 0
+		.amdhsa_system_vgpr_workitem_id 0
+		.amdhsa_next_free_vgpr 9
+		.amdhsa_next_free_sgpr 16
+		.amdhsa_reserve_flat_scratch 0
+		.amdhsa_reserve_xnack_mask 1
+		.amdhsa_float_round_mode_32 0
+		.amdhsa_float_round_mode_16_64 0
+		.amdhsa_float_denorm_mode_32 3
+		.amdhsa_float_denorm_mode_16_64 3
+		.amdhsa_dx10_clamp 1
+		.amdhsa_ieee_mode 1
+		.amdhsa_fp16_overflow 0
+		.amdhsa_exception_fp_ieee_invalid_op 0
+		.amdhsa_exception_fp_denorm_src 0
+		.amdhsa_exception_fp_ieee_div_zero 0
+		.amdhsa_exception_fp_ieee_overflow 0
+		.amdhsa_exception_fp_ieee_underflow 0
+		.amdhsa_exception_fp_ieee_inexact 0
+		.amdhsa_exception_int_div_zero 0
+	.end_amdhsa_kernel
+	.text
+.Lfunc_end0:
+	.size	_Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
+                                        ; -- End function
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 200
+; NumSgprs: 18
+; NumVgprs: 9
+; NumAgprs: 0
+; TotalNumVgprs: 9
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 2
+; NumSGPRsForWavesPerEU: 18
+; NumVGPRsForWavesPerEU: 9
+; Occupancy: 10
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.type	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
+
+	.ident	"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
+	.section	".note.GNU-stack"
+	.addrsig
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.amdgpu_metadata
+---
+amdhsa.kernels:
+  - .args:
+      - .address_space:  global
+        .offset:         0
+        .size:           8
+        .value_kind:     global_buffer
+      - .address_space:  global
+        .offset:         8
+        .size:           8
+        .value_kind:     global_buffer
+      - .offset:         16
+        .size:           8
+        .value_kind:     by_value
+      - .offset:         24
+        .size:           8
+        .value_kind:     hidden_global_offset_x
+      - .offset:         32
+        .size:           8
+        .value_kind:     hidden_global_offset_y
+      - .offset:         40
+        .size:           8
+        .value_kind:     hidden_global_offset_z
+      - .address_space:  global
+        .offset:         48
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         56
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         64
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         72
+        .size:           8
+        .value_kind:     hidden_multigrid_sync_arg
+    .group_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 80
+    .language:       OpenCL C
+    .language_version:
+      - 2
+      - 0
+    .max_flat_workgroup_size: 1024
+    .name:           _Z20vector_square_kernelIfEvPT_PKS0_x
+    .private_segment_fixed_size: 0
+    .sgpr_count:     18
+    .sgpr_spill_count: 0
+    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_x.kd
+    .vgpr_count:     9
+    .vgpr_spill_count: 0
+    .wavefront_size: 64
+amdhsa.target:   amdgcn-amd-amdhsa--gfx908
+amdhsa.version:
+  - 1
+  - 1
+...
+
+	.end_amdgpu_metadata
diff --git a/HIP-Basic/assembly_to_executable/main_gfx90a.s b/HIP-Basic/assembly_to_executable/main_gfx90a.s
new file mode 100644
index 00000000..85575938
--- /dev/null
+++ b/HIP-Basic/assembly_to_executable/main_gfx90a.s
@@ -0,0 +1,226 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx90a"
+	.protected	_Z20vector_square_kernelIfEvPT_PKS0_x ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_x
+	.globl	_Z20vector_square_kernelIfEvPT_PKS0_x
+	.p2align	8
+	.type	_Z20vector_square_kernelIfEvPT_PKS0_x,@function
+_Z20vector_square_kernelIfEvPT_PKS0_x:  ; @_Z20vector_square_kernelIfEvPT_PKS0_x
+; %bb.0: entry - compute this lane's starting element index and mask off lanes whose index is not below the by-value count
+	s_load_dword s0, s[4:5], 0x4 ; dword at byte offset 4 from s[4:5] (presumably the dispatch packet; low 16 bits look like workgroup_size_x - confirm against the HSA AQL layout)
+	s_load_dwordx2 s[12:13], s[6:7], 0x10 ; 64-bit value at byte offset 16 from s[6:7] (presumably the kernarg segment; offset 16 is the by_value argument in the metadata)
+	v_mov_b32_e32 v1, 0 ; upper half of the 64-bit index in v[0:1] starts at zero
+	s_waitcnt lgkmcnt(0) ; wait for the scalar loads above
+	s_and_b32 s0, s0, 0xffff ; keep only the low 16 bits of the loaded dword
+	s_mul_i32 s8, s8, s0 ; s8 *= s0 (s8 is presumably workgroup_id_x, the only system SGPR enabled in the descriptor)
+	v_add_u32_e32 v0, s8, v0 ; v0 = s8 + v0 (v0 is presumably the work-item id x on entry)
+	v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; vcc = count > index (unsigned 64-bit), per lane
+	s_and_saveexec_b64 s[0:1], vcc ; save exec in s[0:1], then exec &= vcc
+	s_cbranch_execz BB0_3 ; no lane has work: go straight to the end
+; %bb.1: loop setup - load the stride and both buffer pointers, convert index and stride to byte offsets
+	s_load_dword s14, s[4:5], 0xc ; dword at byte offset 12 from s[4:5]; used below as the per-iteration index stride (presumably grid_size_x of the dispatch packet - confirm)
+	s_load_dwordx4 s[8:11], s[6:7], 0x0 ; first 16 bytes from s[6:7]: s[8:9] = argument at offset 0, s[10:11] = argument at offset 8
+	s_mov_b32 s15, 0 ; zero-extend the stride to 64 bits in s[14:15]
+	v_lshlrev_b64 v[2:3], 2, v[0:1] ; v[2:3] = index << 2, i.e. byte offset for 4-byte elements
+	s_mov_b64 s[6:7], 0 ; s[6:7] accumulates the mask of finished lanes; start empty
+	s_waitcnt lgkmcnt(0) ; wait for the scalar loads above
+	s_lshl_b64 s[4:5], s[14:15], 2 ; s[4:5] = stride << 2, the stride in bytes
+BB0_2:                                  ; =>This Inner Loop Header: Depth=1
+	v_mov_b32_e32 v5, s11 ; high half of the source pointer s[10:11]
+	v_add_co_u32_e32 v4, vcc, s10, v2 ; low half: source pointer + byte offset, carry out in vcc
+	v_addc_co_u32_e32 v5, vcc, v5, v3, vcc ; high half with carry: v[4:5] = s[10:11] + v[2:3]
+	global_load_dword v6, v[4:5], off ; v6 = 32-bit element at the source address
+	v_mov_b32_e32 v5, s9 ; high half of the destination pointer s[8:9]
+	v_mov_b32_e32 v7, s15 ; high half of the index stride (zero)
+	v_add_co_u32_e32 v0, vcc, s14, v0 ; index low half += stride, carry out in vcc
+	v_mov_b32_e32 v8, s5 ; high half of the byte stride
+	v_add_co_u32_e64 v4, s[0:1], s8, v2 ; low half: destination pointer + current byte offset, carry out in s[0:1]
+	v_add_co_u32_e64 v2, s[2:3], s4, v2 ; byte offset low half += byte stride (for the next iteration), carry out in s[2:3]
+	v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] ; high half with carry: v[4:5] = s[8:9] + current byte offset
+	v_addc_co_u32_e32 v1, vcc, v1, v7, vcc ; index high half += carry; v[0:1] is now the next index
+	v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3] ; byte offset high half += byte stride high half + carry
+	v_cmp_le_u64_e32 vcc, s[12:13], v[0:1] ; vcc = count <= next index, i.e. lanes that have no further element
+	s_or_b64 s[6:7], vcc, s[6:7] ; add those lanes to the finished mask
+	s_waitcnt vmcnt(0) ; wait for the global load into v6
+	v_mul_f32_e32 v6, v6, v6 ; square the element as a 32-bit float
+	global_store_dword v[4:5], v6, off ; store the result at the destination address
+	s_andn2_b64 exec, exec, s[6:7] ; deactivate finished lanes
+	s_cbranch_execnz BB0_2 ; loop while any lane is still active
+BB0_3:
+	s_endpgm ; end of the program for this wave
+	.section	.rodata,#alloc
+	.p2align	6
+	.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_x
+		.amdhsa_group_segment_fixed_size 0
+		.amdhsa_private_segment_fixed_size 0
+		.amdhsa_kernarg_size 80
+		.amdhsa_user_sgpr_private_segment_buffer 1
+		.amdhsa_user_sgpr_dispatch_ptr 1
+		.amdhsa_user_sgpr_queue_ptr 0
+		.amdhsa_user_sgpr_kernarg_segment_ptr 1
+		.amdhsa_user_sgpr_dispatch_id 0
+		.amdhsa_user_sgpr_flat_scratch_init 0
+		.amdhsa_user_sgpr_private_segment_size 0
+		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
+		.amdhsa_system_sgpr_workgroup_id_x 1
+		.amdhsa_system_sgpr_workgroup_id_y 0
+		.amdhsa_system_sgpr_workgroup_id_z 0
+		.amdhsa_system_sgpr_workgroup_info 0
+		.amdhsa_system_vgpr_workitem_id 0
+		.amdhsa_next_free_vgpr 9
+		.amdhsa_next_free_sgpr 16
+		.amdhsa_accum_offset 12
+		.amdhsa_reserve_flat_scratch 0
+		.amdhsa_reserve_xnack_mask 1
+		.amdhsa_float_round_mode_32 0
+		.amdhsa_float_round_mode_16_64 0
+		.amdhsa_float_denorm_mode_32 3
+		.amdhsa_float_denorm_mode_16_64 3
+		.amdhsa_dx10_clamp 1
+		.amdhsa_ieee_mode 1
+		.amdhsa_fp16_overflow 0
+		.amdhsa_tg_split 0
+		.amdhsa_exception_fp_ieee_invalid_op 0
+		.amdhsa_exception_fp_denorm_src 0
+		.amdhsa_exception_fp_ieee_div_zero 0
+		.amdhsa_exception_fp_ieee_overflow 0
+		.amdhsa_exception_fp_ieee_underflow 0
+		.amdhsa_exception_fp_ieee_inexact 0
+		.amdhsa_exception_int_div_zero 0
+	.end_amdhsa_kernel
+	.text
+.Lfunc_end0:
+	.size	_Z20vector_square_kernelIfEvPT_PKS0_x, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_x
+                                        ; -- End function
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 200
+; NumSgprs: 18
+; NumVgprs: 9
+; NumAgprs: 0
+; TotalNumVgprs: 9
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 2
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 18
+; NumVGPRsForWavesPerEU: 9
+; AccumOffset: 12
+; Occupancy: 8
+; WaveLimiterHint : 1
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 8
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
+; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
+	.text
+	.p2alignl 6, 3212836864
+	.fill 256, 4, 3212836864
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
+
+	.protected	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.type	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
+	.section	.rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
+	.weak	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
+	.zero	1
+	.size	_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
+
+	.ident	"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"
+	.section	".note.GNU-stack"
+	.addrsig
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
+	.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
+	.amdgpu_metadata
+---
+amdhsa.kernels:
+  - .args:
+      - .address_space:  global
+        .offset:         0
+        .size:           8
+        .value_kind:     global_buffer
+      - .address_space:  global
+        .offset:         8
+        .size:           8
+        .value_kind:     global_buffer
+      - .offset:         16
+        .size:           8
+        .value_kind:     by_value
+      - .offset:         24
+        .size:           8
+        .value_kind:     hidden_global_offset_x
+      - .offset:         32
+        .size:           8
+        .value_kind:     hidden_global_offset_y
+      - .offset:         40
+        .size:           8
+        .value_kind:     hidden_global_offset_z
+      - .address_space:  global
+        .offset:         48
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         56
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         64
+        .size:           8
+        .value_kind:     hidden_none
+      - .address_space:  global
+        .offset:         72
+        .size:           8
+        .value_kind:     hidden_multigrid_sync_arg
+    .group_segment_fixed_size: 0
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 80
+    .language:       OpenCL C
+    .language_version:
+      - 2
+      - 0
+    .max_flat_workgroup_size: 1024
+    .name:           _Z20vector_square_kernelIfEvPT_PKS0_x
+    .private_segment_fixed_size: 0
+    .sgpr_count:     18
+    .sgpr_spill_count: 0
+    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_x.kd
+    .vgpr_count:     9
+    .vgpr_spill_count: 0
+    .wavefront_size: 64
+amdhsa.target:   amdgcn-amd-amdhsa--gfx90a
+amdhsa.version:
+  - 1
+  - 1
+...
+
+	.end_amdgpu_metadata
diff --git a/HIP-Basic/bandwidth/.gitignore b/HIP-Basic/bandwidth/.gitignore
new file mode 100644
index 00000000..d69da8d5
--- /dev/null
+++ b/HIP-Basic/bandwidth/.gitignore
@@ -0,0 +1 @@
+hip_bandwidth
diff --git a/HIP-Basic/bandwidth/CMakeLists.txt b/HIP-Basic/bandwidth/CMakeLists.txt
new file mode 100644
index 00000000..3d319b43
--- /dev/null
+++ b/HIP-Basic/bandwidth/CMakeLists.txt
@@ -0,0 +1,56 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_bandwidth)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+set(GPU_RUNTIMES "HIP" "CUDA")
+set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})
+
+if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+enable_language(${GPU_RUNTIME})
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH)
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+add_executable(${example_name} main.hip)
+# Make example runnable using ctest. The NAME/COMMAND signature lets CMake replace the target name with the built executable's location.
+add_test(NAME ${example_name} COMMAND ${example_name})
+set(include_dirs "../../Common")
+if(GPU_RUNTIME STREQUAL "CUDA")
+    list(APPEND include_dirs "${ROCM_ROOT}/include")
+endif()
+
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
diff --git a/HIP-Basic/bandwidth/Makefile b/HIP-Basic/bandwidth/Makefile
new file mode 100644
index 00000000..6c821f21
--- /dev/null
+++ b/HIP-Basic/bandwidth/Makefile
@@ -0,0 +1,54 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+EXAMPLE := hip_bandwidth
+COMMON_INCLUDE_DIR := ../../Common
+GPU_RUNTIME := HIP
+
+# HIP variables
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+
+# Common variables and flags
+CXX_STD  := c++17
+CXXFLAGS := -std=$(CXX_STD)
+CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+LDFLAGS	 :=
+LDLIBS	 :=
+
+ifeq ($(GPU_RUNTIME), CUDA)
+	CXXFLAGS += -x cu
+	CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
+else ifeq ($(GPU_RUNTIME), HIP)
+else
+$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
+endif
+
+$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp # rebuild when either Common header changes
+	$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
+
+clean:
+	$(RM) $(EXAMPLE)
+
+.PHONY: clean
diff --git a/HIP-Basic/bandwidth/README.md b/HIP-Basic/bandwidth/README.md
new file mode 100644
index 00000000..31bbba35
--- /dev/null
+++ b/HIP-Basic/bandwidth/README.md
@@ -0,0 +1,28 @@
+# Cookbook Bandwidth Example
+
+## Description
+This example measures the memory bandwidth capacity of GPU devices. It performs memcpy from host to GPU device, GPU device to host, and within a single GPU.
+
+### Application flow 
+1. User commandline arguments are parsed and test parameters are initialized. If there are no commandline arguments, the test parameters are initialized with default values.
+2. Bandwidth tests are launched.
+3. If the memory type for the test is set to `-memory pageable`, the host side data is instantiated in `std::vector<unsigned char>`. If the memory type for the test is set to `-memory pinned`, the host side data is instantiated in `unsigned char*` and allocated using `hipHostMalloc`.
+4. Device side storage is allocated using `hipMalloc` in `unsigned char*`.
+5. Memory transfer is performed `trials` number of times using `hipMemcpy` for pageable memory or using `hipMemcpyAsync` for host allocated pinned memory.
+6. The time taken by the memory transfer operations is measured and then used to calculate the bandwidth.
+7. All device memory is freed using `hipFree` and all host allocated pinned memory is freed using `hipHostFree`.
+
+## Key APIs and Concepts
+The program uses HIP pageable and pinned memory. It is important to note that the pinned memory is allocated using `hipHostMalloc` and is destroyed using `hipHostFree`. The HIP memory transfer routine `hipMemcpyAsync` will behave synchronously if the host memory is not pinned. Therefore, it is important to allocate pinned host memory using `hipHostMalloc` for `hipMemcpyAsync` to behave asynchronously.
+
+## Demonstrated API Calls
+### HIP runtime
+- `hipMalloc`
+- `hipMemcpy`
+- `hipMemcpyAsync`
+- `hipGetDeviceCount`
+- `hipGetDeviceProperties`
+- `hipFree`
+- `hipHostFree`
+- `hipHostMalloc`
+- `hipSetDevice`
diff --git a/HIP-Basic/bandwidth/bandwidth_vs2019.sln b/HIP-Basic/bandwidth/bandwidth_vs2019.sln
new file mode 100644
index 00000000..09016afe
--- /dev/null
+++ b/HIP-Basic/bandwidth/bandwidth_vs2019.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32630.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidth_vs2019", "bandwidth_vs2019.vcxproj", "{16B11B54-CD72-43B6-B226-38C668B41A79}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.ActiveCfg = Debug|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.Build.0 = Debug|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.ActiveCfg = Release|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {1E2ACB7F-1706-491A-9E62-395C1BD8E637}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj
new file mode 100644
index 00000000..3283ff1b
--- /dev/null
+++ b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj
@@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\cmdparser.hpp" />
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{16b11b54-cd72-43b6-b226-38c668b41a79}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>bandwidth_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters
new file mode 100644
index 00000000..7dc35f68
--- /dev/null
+++ b/HIP-Basic/bandwidth/bandwidth_vs2019.vcxproj.filters
@@ -0,0 +1,30 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{c71d9db2-bf13-49ee-b794-626d24391150}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{520f4985-c9bd-4add-9485-049fafe0cdca}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{006f799a-d711-49a7-93da-7f60d8872b02}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\cmdparser.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\Common\example_utils.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/HIP-Basic/bandwidth/main.hip b/HIP-Basic/bandwidth/main.hip
new file mode 100644
index 00000000..56d127ae
--- /dev/null
+++ b/HIP-Basic/bandwidth/main.hip
@@ -0,0 +1,637 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "cmdparser.hpp"
+#include "example_utils.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <iostream>
+#include <map>
+#include <numeric>
+#include <vector>
+
+// Host memory kind used for the transfers: PAGED is a std::vector, PINNED comes from hipHostMalloc
+enum class MemoryMode : unsigned int
+{
+    PAGED,
+    PINNED
+};
+
+// Test either a range of input sizes with a constant increment
+// or a more complex shmoo test that tests bandwidth for a large number of varying sizes.
+enum class TestMode : unsigned int
+{
+    RANGED,
+    SHMOO
+};
+
+/// \brief Run host to device or device to host transfers and calculate the bandwidth for each size
+/// \param memory_copy_measurement_sizes Transfer sizes in bytes, one measurement is made per entry
+/// \param device ID of the device that is made current for the measurements
+/// \param hip_memcpy_kind Direction to measure: hipMemcpyHostToDevice or hipMemcpyDeviceToHost
+/// \param memory_mode Whether the host buffers are pageable (std::vector) or pinned (hipHostMalloc)
+/// \param trails Number of timed transfers per size
+/// \return One bandwidth in GB/s per entry of memory_copy_measurement_sizes
+std::vector<double>
+    run_bandwidth_host_device(const std::vector<unsigned long>& memory_copy_measurement_sizes,
+                              const int                         device,
+                              hipMemcpyKind                     hip_memcpy_kind,
+                              const MemoryMode                  memory_mode,
+                              const unsigned int                trails)
+{
+
+    // Check for invalid configurations
+    if(hip_memcpy_kind == hipMemcpyDeviceToDevice)
+    {
+        std::cerr << "hipMemcpyDeviceToDevice is an invalid Configuration\n";
+        exit(error_exit_code);
+    }
+
+    // The bandwidths calculated will be stored in bandwidth_measurements
+    std::vector<double> bandwidth_measurements;
+
+    // Flush buffer for CPU cache
+    constexpr size_t  flush_size = 256 * 1024 * 1024;
+    std::vector<char> flush_buffer(flush_size);
+
+    HIP_CHECK(hipSetDevice(device));
+
+    if(hip_memcpy_kind == hipMemcpyHostToDevice)
+    {
+        std::cout << "Measuring Host to Device Bandwidth: " << std::flush;
+    }
+    else
+    {
+        std::cout << "Measuring Device to Host Bandwidth: " << std::flush;
+    }
+
+    for(auto size : memory_copy_measurement_sizes)
+    {
+        std::cout << "[" << size << "] " << std::flush;
+
+        // Size in bytes
+        const size_t size_in_bytes = sizeof(unsigned char) * size;
+
+        // Allocate device input memory
+        unsigned char* d_in = nullptr;
+        HIP_CHECK(hipMalloc(&d_in, size_in_bytes));
+
+        // Memory transfer between host and device
+        if(memory_mode == MemoryMode::PAGED)
+        {
+            // Host input memory
+            std::vector<unsigned char> h_in(size);
+
+            // Host output memory
+            std::vector<unsigned char> h_out(size);
+
+            // Initialize the host input memory
+            for(size_t i = 0; i < size; i++)
+            {
+                h_in[i] = static_cast<unsigned char>(i & 0xff);
+            }
+
+            unsigned char* src = nullptr;
+            unsigned char* dst = nullptr;
+
+            switch(hip_memcpy_kind)
+            {
+                case hipMemcpyHostToDevice:
+                    // Set the source and destination for hipMemcpy
+                    src = h_in.data();
+                    dst = d_in;
+                    break;
+                case hipMemcpyDeviceToHost:
+                    // Transfer the host input to device
+                    HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice));
+
+                    // Set the source and destination for hipMemcpy
+                    src = d_in;
+                    dst = h_out.data();
+                    break;
+                default:
+                    std::cerr << "Invalid memcpy kind " << hip_memcpy_kind << "! \n";
+                    exit(error_exit_code);
+            }
+
+            // Timer class
+            HostClock host_clock;
+
+            // Perform memory transfers warm up
+            for(unsigned int i = 0; i < 5; i++)
+            {
+                // Initiate the memory transfer
+                HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hip_memcpy_kind));
+
+                // Flush the buffer
+                memset(flush_buffer.data(), i, flush_buffer.size());
+            }
+
+            // Perform memory transfers for trails number of times
+            for(unsigned int i = 0; i < trails; i++)
+            {
+                host_clock.start_timer();
+
+                // Initiate the memory transfer
+                HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hip_memcpy_kind));
+
+                host_clock.stop_timer();
+
+                // Flush the buffer outside of the timed section
+                memset(flush_buffer.data(), i, flush_buffer.size());
+            }
+            // Calculate the bandwidth in GB/s
+            const double bandwidth_achieved
+                = ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time();
+
+            bandwidth_measurements.emplace_back(bandwidth_achieved);
+        }
+        else if(memory_mode == MemoryMode::PINNED) // Pinned memory mode
+        {
+            // Host input memory
+            unsigned char* h_in = nullptr;
+
+            // Host output memory
+            unsigned char* h_out = nullptr;
+
+            HIP_CHECK(hipHostMalloc(&h_in, size_in_bytes));
+            HIP_CHECK(hipHostMalloc(&h_out, size_in_bytes));
+
+            // Initialize the host memory
+            for(size_t i = 0; i < size; i++)
+            {
+                h_in[i] = static_cast<unsigned char>(i & 0xff);
+            }
+
+            unsigned char* src = nullptr;
+            unsigned char* dst = nullptr;
+
+            if(hip_memcpy_kind == hipMemcpyHostToDevice)
+            {
+                // Set the source and destination for hipMemcpy
+                src = h_in;
+                dst = d_in;
+            }
+            else if(hip_memcpy_kind == hipMemcpyDeviceToHost)
+            {
+                // Transfer the host input to device. This preparation step is always a host to
+                // device copy, independently of the direction that is measured afterwards
+                HIP_CHECK(hipMemcpyAsync(d_in, h_in, size_in_bytes, hipMemcpyHostToDevice));
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Set the source and destination for hipMemcpy
+                src = d_in;
+                dst = h_out;
+            }
+            else
+            {
+                std::cerr << "Invalid memcpy kind " << hip_memcpy_kind << "! \n";
+                exit(error_exit_code);
+            }
+
+            // Perform memory transfers warm up
+            for(unsigned int i = 0; i < 5; i++)
+            {
+                HIP_CHECK(hipMemcpyAsync(dst, src, size_in_bytes, hip_memcpy_kind));
+            }
+            HIP_CHECK(hipDeviceSynchronize());
+
+            HostClock host_clock;
+            host_clock.start_timer();
+
+            // Initiate the memory transfer
+            // Perform memory transfers for trails number of times
+            for(unsigned int i = 0; i < trails; i++)
+            {
+                HIP_CHECK(hipMemcpyAsync(dst, src, size_in_bytes, hip_memcpy_kind));
+            }
+
+            HIP_CHECK(hipDeviceSynchronize());
+
+            host_clock.stop_timer();
+
+            // Calculate the bandwidth in GB/s
+            const double bandwidth_achieved
+                = ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time();
+
+            bandwidth_measurements.emplace_back(bandwidth_achieved);
+
+            HIP_CHECK(hipHostFree(h_in));
+            HIP_CHECK(hipHostFree(h_out));
+        }
+
+        // Free the memory
+        HIP_CHECK(hipFree(d_in));
+    }
+    std::cout << std::endl;
+
+    return bandwidth_measurements;
+}
+
+/// \brief Run device to device transfers and calculate the bandwidth for each size
+/// \param memory_copy_measurement_sizes Transfer sizes in bytes, one measurement is made per entry
+/// \param device ID of the device that is made current for the measurements
+/// \param trails Number of timed transfers per size
+/// \return One bandwidth in GB/s per entry of memory_copy_measurement_sizes
+std::vector<double>
+    run_bandwidth_device_device(const std::vector<unsigned long>& memory_copy_measurement_sizes,
+                                const int                         device,
+                                const unsigned int                trails)
+{
+    // The bandwidths calculated will be stored in bandwidth_measurements
+    std::vector<double> bandwidth_measurements;
+
+    HIP_CHECK(hipSetDevice(device));
+
+    std::cout << "Measuring Device to Device Bandwidth: " << std::flush;
+    for(auto size : memory_copy_measurement_sizes)
+    {
+        std::cout << "[" << size << "] " << std::flush;
+
+        // Size in bytes
+        const size_t size_in_bytes = sizeof(unsigned char) * size;
+
+        // Allocate device input memory
+        unsigned char* d_in = nullptr;
+        HIP_CHECK(hipMalloc(&d_in, size_in_bytes));
+
+        // Device output memory.
+        unsigned char* d_out = nullptr;
+        HIP_CHECK(hipMalloc(&d_out, size_in_bytes));
+
+        // Host input memory
+        std::vector<unsigned char> h_in(size);
+
+        // Initialize the host input memory
+        for(size_t i = 0; i < size; i++)
+        {
+            h_in[i] = static_cast<unsigned char>(i & 0xff);
+        }
+
+        // Transfer the host input to device
+        HIP_CHECK(hipMemcpy(d_in, h_in.data(), size_in_bytes, hipMemcpyHostToDevice));
+
+        // Set the source and destination for hipMemcpy
+        unsigned char* src = d_in;
+        unsigned char* dst = d_out;
+
+        // Perform memory transfers warm up
+        for(unsigned int i = 0; i < 5; i++)
+        {
+            HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hipMemcpyDeviceToDevice));
+        }
+
+        // Synchronize because the device to device memory copy is non-blocking
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Timer class
+        HostClock host_clock;
+        host_clock.start_timer();
+
+        // Perform memory transfers for trails number of times
+        for(unsigned int i = 0; i < trails; i++)
+        {
+            // Initiate the memory transfer
+            HIP_CHECK(hipMemcpy(dst, src, size_in_bytes, hipMemcpyDeviceToDevice));
+        }
+        HIP_CHECK(hipDeviceSynchronize());
+
+        host_clock.stop_timer();
+
+        // Calculate the bandwidth in GB/s
+        const double bandwidth_achieved
+            = ((size_in_bytes * trails) / 1e9) / host_clock.get_elapsed_time();
+
+        bandwidth_measurements.emplace_back(bandwidth_achieved);
+
+        // Free the device output and input memory
+        HIP_CHECK(hipFree(d_out));
+        HIP_CHECK(hipFree(d_in));
+    }
+    std::cout << std::endl;
+
+    return bandwidth_measurements;
+}
+
+/// \brief Sizes start, start + stride, ... strictly below end; a zero stride yields one size at most.
+std::vector<unsigned long>
+    generate_measurement_sizes_range(const size_t start_measurement,
+                                     const size_t end_measurement,
+                                     const size_t stride_between_measurements)
+{
+    const size_t stride = stride_between_measurements ? stride_between_measurements : end_measurement;
+    std::vector<unsigned long> memory_copy_measurement_sizes;
+    for(size_t i = start_measurement; i < end_measurement;
+        i = (end_measurement - i > stride) ? i + stride : end_measurement) // cannot wrap around
+    {
+        memory_copy_measurement_sizes.emplace_back(i);
+    }
+    return memory_copy_measurement_sizes;
+}
+
+/// \brief Generate the transfer sizes that are measured in shmoo mode.
+///
+/// Starting from zero, the size grows by an increment that depends on the range the
+/// current size falls in, and every size obtained this way is recorded:
+///   - below 20 KB:  steps of 1 KB
+///   - below 50 KB:  steps of 2 KB
+///   - below 100 KB: steps of 10 KB
+///   - below 1 MB:   steps of 100 KB
+///   - below 16 MB:  steps of 1 MB
+///   - below 32 MB:  steps of 2 MB
+///   - otherwise:    steps of 4 MB
+/// The limit of 64 MB is tested before the increment is applied, hence the last size
+/// that is recorded is the first one above 64 MB.
+std::vector<unsigned long> generate_measurement_sizes_shmoo()
+{
+    // Pairs an exclusive upper limit with the increment that is applied below that limit
+    struct ShmooRange
+    {
+        size_t limit;
+        size_t increment;
+    };
+
+    constexpr size_t kilobyte      = 1 << 10;
+    constexpr size_t ten_kilobytes = 10 * kilobyte;
+    constexpr size_t megabyte      = 1 << 20;
+
+    // Sorted by ascending limit, the first range that contains the size is the one used
+    constexpr ShmooRange ranges[] = {
+        {2 * ten_kilobytes, kilobyte}, // below 20 KB
+        {5 * ten_kilobytes, 2 * kilobyte}, // below 50 KB
+        {10 * ten_kilobytes, ten_kilobytes}, // below 100 KB
+        {megabyte, 10 * ten_kilobytes}, // below 1 MB
+        {16 * megabyte, megabyte}, // below 16 MB
+        {32 * megabyte, 2 * megabyte}, // below 32 MB
+    };
+
+    // Increment that is applied once the size reached the largest limit
+    constexpr size_t last_increment = 4 * megabyte;
+
+    // No further size is generated once this value is exceeded
+    constexpr size_t memsize_max = 64 * megabyte;
+
+    std::vector<unsigned long> sizes;
+    for(size_t size = 0; size <= memsize_max;)
+    {
+        // Select the increment of the first range that contains the current size
+        size_t increment = last_increment;
+        for(const ShmooRange& range : ranges)
+        {
+            if(size < range.limit)
+            {
+                increment = range.increment;
+                break;
+            }
+        }
+        size += increment;
+        sizes.emplace_back(size);
+    }
+
+    return sizes;
+}
+
+void configure_parser(cli::Parser& parser)
+{
+    // Register every command line option of the example together with its default value
+    parser.set_optional<size_t>("start", "start", 1 << 20, "Starting size"); // Default 1 MB
+    parser.set_optional<size_t>("end", "end", 1 << 23, "Ending size"); // Default 8 MB
+    parser.set_optional<size_t>("stride",
+                                "stride",
+                                1 << 22, // Default 4 MB
+                                "Stride (or increment) between sizes");
+
+    parser.set_optional<std::string>("mode",
+                                     "mode",
+                                     "range",
+                                     "Mode of bandwidth test: range or shmoo");
+    parser.set_optional<std::string>("memory",
+                                     "memory",
+                                     "pageable",
+                                     "Memory allocation kind: pageable or pinned\n");
+    parser.set_optional<size_t>("trials", "trials", 50, "Number of trials");
+    parser.set_optional<std::vector<std::string>>(
+        "device",
+        "device",
+        {"0"},
+        "Space-separated list of devices\n"
+        "\tall for using all the available devices\n"
+        "\t0,1,2,...,n for using any particular available devices");
+    parser.set_optional<std::vector<std::string>>("memcpy",
+                                                  "memcpy",
+                                                  {"htod", "dtoh", "dtod"},
+                                                  "Space-separated list of memory copy kinds, or all.\n"
+                                                  "\thtod is host to device\n"
+                                                  "\tdtoh is device to host\n"
+                                                  "\tdtod is device to device");
+}
+
+int main(int argc, char** argv)
+{
+    // Measure and print the bandwidth of every requested copy kind on every selected device
+    // Get the number of hip devices in the system
+    int number_of_devices = 0;
+    HIP_CHECK(hipGetDeviceCount(&number_of_devices));
+
+    if(number_of_devices <= 0)
+    {
+        std::cerr << "HIP supported devices not found!"
+                  << "\n";
+        exit(error_exit_code);
+    }
+
+    // Parse user inputs
+    cli::Parser parser(argc, argv);
+    configure_parser(parser);
+    parser.run_and_exit_if_error();
+
+    // Set configurations for testing bandwidth
+    const size_t                   trials                      = parser.get<size_t>("trials");
+    const size_t                   start_measurement           = parser.get<size_t>("start");
+    const size_t                   end_measurement             = parser.get<size_t>("end");
+    const size_t                   stride_between_measurements = parser.get<size_t>("stride");
+    const std::string              mode                        = parser.get<std::string>("mode");
+    const std::string              memory_cmd                  = parser.get<std::string>("memory");
+    const std::vector<std::string> devices_cmd = parser.get<std::vector<std::string>>("device");
+    const std::vector<std::string> memcpy_cmd  = parser.get<std::vector<std::string>>("memcpy");
+
+    // Set the mode of bandwidth test: RANGED or SHMOO
+    TestMode mode_of_test;
+
+    if(mode == "range")
+    {
+        mode_of_test = TestMode::RANGED;
+    }
+    else if(mode == "shmoo")
+    {
+        mode_of_test = TestMode::SHMOO;
+    }
+    else
+    {
+        std::cerr << "Invalid mode " << mode << "! \n";
+        exit(error_exit_code);
+    }
+
+    // Set the memory host allocation type: PAGED or PINNED
+    MemoryMode memory_allocation;
+    if(memory_cmd == "pageable")
+    {
+        memory_allocation = MemoryMode::PAGED;
+    }
+    else if(memory_cmd == "pinned")
+    {
+        memory_allocation = MemoryMode::PINNED;
+    }
+    else
+    {
+        std::cerr << "Invalid memory allocation " << memory_cmd << "! \n";
+        exit(error_exit_code);
+    }
+
+    // Store device ids
+    std::vector<int> devices;
+    if(std::find(devices_cmd.begin(), devices_cmd.end(), "all") != devices_cmd.end())
+    {
+        devices = std::vector<int>(number_of_devices);
+
+        // Initialize the default device ids
+        std::iota(devices.begin(), devices.end(), 0);
+    }
+    else
+    {
+        for(const std::string& device : devices_cmd)
+        {
+            int device_id;
+            if(!parse_int_string(device, device_id))
+            {
+                std::cerr << "Invalid device ID " << device << "!\n";
+                exit(error_exit_code);
+            }
+
+            if(device_id < 0 || device_id >= number_of_devices)
+            {
+                std::cerr << "Invalid device id " << device << "!\n"
+                          << "Device does not exist\n";
+                exit(error_exit_code);
+            }
+            devices.emplace_back(device_id);
+        }
+    }
+
+    std::cout << "Devices: " << format_range(devices.begin(), devices.end()) << "\n";
+
+    // Set hipMemcpyKind
+    std::map<hipMemcpyKind, std::string> memcpy_kinds;
+    if(std::find(memcpy_cmd.begin(), memcpy_cmd.end(), "all") != memcpy_cmd.end())
+    {
+        memcpy_kinds.insert({hipMemcpyHostToDevice, "Host to Device"});
+        memcpy_kinds.insert({hipMemcpyDeviceToHost, "Device to Host"});
+        memcpy_kinds.insert({hipMemcpyDeviceToDevice, "Device to Device"});
+    }
+    else
+    {
+        for(const std::string& memcpy_name : memcpy_cmd)
+        {
+            if(memcpy_name == "htod")
+            {
+                memcpy_kinds.insert({hipMemcpyHostToDevice, "Host to Device"});
+            }
+            else if(memcpy_name == "dtoh")
+            {
+                memcpy_kinds.insert({hipMemcpyDeviceToHost, "Device to Host"});
+            }
+            else if(memcpy_name == "dtod")
+            {
+                memcpy_kinds.insert({hipMemcpyDeviceToDevice, "Device to Device"});
+            }
+            else
+            {
+                std::cerr << "Invalid memcpy!"
+                          << "\n";
+                exit(error_exit_code);
+            }
+        }
+    }
+
+    std::vector<unsigned long> memory_copy_measurement_sizes;
+    if(mode_of_test == TestMode::RANGED)
+    {
+        memory_copy_measurement_sizes
+            = generate_measurement_sizes_range(start_measurement,
+                                               end_measurement,
+                                               stride_between_measurements);
+    }
+    else
+    {
+        memory_copy_measurement_sizes = generate_measurement_sizes_shmoo();
+    }
+
+    std::cout << "Measurement Sizes: "
+              << format_range(memory_copy_measurement_sizes.begin(),
+                              memory_copy_measurement_sizes.end())
+              << "\n\n";
+
+    // Run the bandwidth tests on devices
+    for(auto device : devices)
+    {
+        hipDeviceProp_t devProp;
+        HIP_CHECK(hipSetDevice(device));
+        HIP_CHECK(hipGetDeviceProperties(&devProp, device));
+
+        for(const auto& memcpy_kind : memcpy_kinds)
+        {
+            std::string print_text;
+            if(memory_allocation == MemoryMode::PAGED)
+            {
+                print_text = "Paged Bandwidth ";
+            }
+            else if(memory_allocation == MemoryMode::PINNED)
+            {
+                print_text = "Pinned Bandwidth ";
+            }
+            if(memcpy_kind.first == hipMemcpyDeviceToDevice)
+            {
+                print_text = "Bandwidth ";
+            }
+
+            std::vector<double> bandwidth_measurements;
+            if(memcpy_kind.first == hipMemcpyDeviceToDevice)
+            {
+                bandwidth_measurements
+                    = run_bandwidth_device_device(memory_copy_measurement_sizes, device, trials);
+            }
+            else
+            {
+                bandwidth_measurements = run_bandwidth_host_device(memory_copy_measurement_sizes,
+                                                                   device,
+                                                                   memcpy_kind.first,
+                                                                   memory_allocation,
+                                                                   trials);
+            }
+            std::cout << "\nDevice ID [" << device << "] Device Name [" << devProp.name
+                      << "]: " << print_text << memcpy_kind.second << " (GB/s): "
+                      << format_range(bandwidth_measurements.begin(), bandwidth_measurements.end())
+                      << "\n\n";
+        }
+    }
+}
diff --git a/HIP-Basic/device_query/device_query_vs2019.sln b/HIP-Basic/device_query/device_query_vs2019.sln
index 9297291b..7dc7482a 100644
--- a/HIP-Basic/device_query/device_query_vs2019.sln
+++ b/HIP-Basic/device_query/device_query_vs2019.sln
@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 16
 VisualStudioVersion = 16.0.32630.194
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_template_vs2019", "example_template_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "device_query_vs2019", "device_query_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
diff --git a/HIP-Basic/device_query/device_query_vs2019.vcxproj b/HIP-Basic/device_query/device_query_vs2019.vcxproj
index 980cf88c..841fe5b5 100644
--- a/HIP-Basic/device_query/device_query_vs2019.vcxproj
+++ b/HIP-Basic/device_query/device_query_vs2019.vcxproj
@@ -20,7 +20,7 @@
     <VCProjectVersion>15.0</VCProjectVersion>
     <ProjectGuid>{C2C6E811-57E3-44C5-9AB9-195D60A1638C}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
-    <RootNamespace>example_template_vs2019</RootNamespace>
+    <RootNamespace>device_query_vs2019</RootNamespace>
     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
@@ -94,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj b/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj
index 6cfab55e..659e43e1 100644
--- a/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj
+++ b/HIP-Basic/dynamic_shared/dynamic_shared_vs2019.vcxproj
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
diff --git a/HIP-Basic/events/README.md b/HIP-Basic/events/README.md
index 4777662f..ff9bd0f9 100644
--- a/HIP-Basic/events/README.md
+++ b/HIP-Basic/events/README.md
@@ -1,6 +1,6 @@
 # HIP-Basic Events Example
 ## Description
-Memory transfer and kernel execution are the most important parameter in parallel computing (especially HPC and machine learning). Memory bottlenecks are the main problem why we are not able to get the highest performance, therefore obtaining the memory transfer timing and kernel execution timing plays key role in application optimization.
+Memory transfer and kernel execution are the most important parameters in parallel computing, especially in high performance computing (HPC) and machine learning. Memory bottlenecks are the main reason why the highest performance is not achieved; therefore, obtaining the memory transfer timing and kernel execution timing plays a key role in application optimization.
 
 This example showcases measuring kernel and memory transfer timing using HIP events. The kernel under measurement is a trivial one that performs square matrix transposition.
 
@@ -8,11 +8,14 @@ This example showcases measuring kernel and memory transfer timing using HIP eve
 1. A number of parameters are defined that control the problem details and the kernel launch.
 2. Input data is set up in host memory.
 3. The necessary amount of device memory is allocated.
-4. A pair of `hipEvent` objects are defined and initialized. Time measurement is started on the `start` event.
-5. Memory transfer from host to device of the input data is performed, and the measurement is stopped using the `stop` event. The execution time is calculated via the `start` and `stop` events and it is printed to the standard output.
-6. The kernel is launched, and its runtime is measured similarly using the `start` and `stop` events.
-7. The result data is copied back to the host, and the execution time of the copy is measured similarly.
-8. The result data is validated by comparing it to the product of the reference (host) implementation. The result of the validation is printed to the standard output.
+4. A pair of `hipEvent` objects are defined and initialized.
+5. Time measurement is started on the `start` event.
+6. Memory transfer from host to device of the input data is performed.
+7. The time measurement is stopped using the `stop` event. The execution time is calculated via the `start` and `stop` events and it is printed to the standard output.
+8. The kernel is launched, and its runtime is measured similarly using the `start` and `stop` events.
+9. The result data is copied back to the host, and the execution time of the copy is measured similarly.
+10. The allocated device memory is freed and the event objects are released.
+11. The result data is validated by comparing it to the product of the reference (host) implementation. The result of the validation is printed to the standard output.
 
 ## Key APIs and Concepts
 - The `hipEvent_t` type defines HIP events that can be used for synchronization and time measurement. The events must be initialized using `hipEventCreate` before usage and destroyed using `hipEventDestroy` after they are no longer needed.
diff --git a/HIP-Basic/events/main.hip b/HIP-Basic/events/main.hip
index 8747c533..5549c5d0 100644
--- a/HIP-Basic/events/main.hip
+++ b/HIP-Basic/events/main.hip
@@ -27,7 +27,7 @@
 #include <iostream>
 #include <vector>
 
-#include <cstddef>
+#include <cstdlib>
 
 /// \brief Performs a simple matrix transpose on the GPU.
 __global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width)
@@ -154,10 +154,12 @@ int main()
     const auto ref_transposed_matrix = matrix_transpose_reference(h_matrix, width);
 
     // Check the results' validity.
-    constexpr double eps = 1.0E-6;
-    unsigned int     errors{};
+    constexpr float eps    = 1.0E-6F;
+    unsigned int    errors = 0;
     for(unsigned int i = 0; i < size; i++)
     {
+        // Most likely the values are bitwise equal, since they were plainly copied,
+        // however it is a good practice to compare floating point values using an epsilon.
         if(std::abs(h_transposed_matrix[i] - ref_transposed_matrix[i]) > eps)
         {
             errors++;
diff --git a/HIP-Basic/hipify/main.cu b/HIP-Basic/hipify/main.cu
index 1f30febd..a1f0abf5 100644
--- a/HIP-Basic/hipify/main.cu
+++ b/HIP-Basic/hipify/main.cu
@@ -89,7 +89,7 @@ int main()
     // Copy the input from host to the GPU device
     CHECK(cudaMemcpy(d_in, h_in.data(), size_in_bytes, cudaMemcpyHostToDevice));
 
-    // Set the the number of blocks per kernel grid.
+    // Set the number of blocks per kernel grid.
     constexpr unsigned int grid_size = 512;
     // Set the number of threads per kernel block.
     constexpr unsigned int threads_per_block = 256;
diff --git a/HIP-Basic/llvm_ir_to_executable/.gitignore b/HIP-Basic/llvm_ir_to_executable/.gitignore
new file mode 100644
index 00000000..85472506
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/.gitignore
@@ -0,0 +1,4 @@
+hip_llvm_ir_to_executable
+*.bc
+*.o
+*.hipfb
diff --git a/HIP-Basic/llvm_ir_to_executable/CMakeLists.txt b/HIP-Basic/llvm_ir_to_executable/CMakeLists.txt
new file mode 100644
index 00000000..9796dca6
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/CMakeLists.txt
@@ -0,0 +1,174 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_llvm_ir_to_executable)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+
+# Only supported on HIP (not CUDA)
+if(NOT "${GPU_RUNTIME}" STREQUAL "HIP")
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be HIP.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+enable_language(${GPU_RUNTIME})
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH)
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+if (NOT DEFINED CMAKE_HIP_ARCHITECTURES)
+    set(GPU_ARCHITECTURES "all" CACHE STRING "GPU architectures to compile for")
+else()
+    set(GPU_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "GPU architectures to compile for")
+endif()
+
+if(GPU_ARCHITECTURES STREQUAL "all")
+    set(GPU_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "GPU architectures to compile for" FORCE)
+endif()
+
+# Remove duplicates
+list(REMOVE_DUPLICATES GPU_ARCHITECTURES)
+message(STATUS "GPU_ARCHITECTURES: ${GPU_ARCHITECTURES}")
+
+set_source_files_properties(main.hip PROPERTIES COMPILE_OPTIONS "--cuda-host-only")
+
+if (WIN32)
+    set(OBJ_TYPE obj)
+    set(NULDEV NUL)
+    set(HOST_TARGET x86_64-pc-windows-msvc)
+    set(HIP_OBJ_GEN_FILE hip_obj_gen_win.mcin)
+else()
+    set(OBJ_TYPE o)
+    set(NULDEV /dev/null)
+    set(HOST_TARGET x86_64-unknown-linux)
+    set(HIP_OBJ_GEN_FILE hip_obj_gen.mcin)
+endif()
+
+# Compile the device LLVM IR files (main_<arch>.ll) to object files using the HIP compiler.
+# The compiler needs -target amdgcn-amd-amdhsa -mcpu=gfx* in order to generate the object file
+# for the right GPU.
+foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
+    message(STATUS "Generating main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
+    add_custom_command(
+        OUTPUT main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
+        COMMAND ${CMAKE_HIP_COMPILER} -fPIC -target amdgcn-amd-amdhsa -mcpu=${HIP_ARCHITECTURE}
+                        ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.ll
+                        -o ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
+        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/main_${HIP_ARCHITECTURE}.ll
+        VERBATIM)
+endforeach()
+
+# Locate clang-offload-bundler, which combines the per-architecture device objects into an offload bundle.
+find_program(
+    OFFLOAD_BUNDLER_COMMAND clang-offload-bundler
+    PATH_SUFFIXES bin
+    PATHS
+    ${ROCM_ROOT}/llvm
+    ${CMAKE_INSTALL_PREFIX}/llvm
+    REQUIRED)
+# Report the resolved path; find_program stores it in OFFLOAD_BUNDLER_COMMAND.
+if(OFFLOAD_BUNDLER_COMMAND)
+    message(STATUS "clang-offload-bundler found: ${OFFLOAD_BUNDLER_COMMAND}")
+else()
+    message(FATAL_ERROR "clang-offload-bundler not found")
+endif()
+
+# Generate object bundle.
+# The invocation to generate is
+# clang-offload-bundler -targets=<targets> -input=<input target #1> -input=<input target #2> ... -output=<output>
+# Note that the host target must be the first target present here, and it should have an empty input associated to it.
+
+# Generate BUNDLE_TARGETS as a string of: -targets=host-${HOST_TARGET},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE},...
+set(BUNDLE_TARGETS "-targets=host-${HOST_TARGET}")
+# Generate BUNDLE_INPUTS as a string of: -input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE} ...
+set(BUNDLE_INPUTS "-input=${NULDEV}")
+# Generate BUNDLE_OBJECTS as a string of: ${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}
+set(BUNDLE_OBJECTS "")
+foreach(HIP_ARCHITECTURE ${GPU_ARCHITECTURES})
+    set(BUNDLE_TARGETS "${BUNDLE_TARGETS},hipv4-amdgcn-amd-amdhsa--${HIP_ARCHITECTURE}")
+    list(APPEND BUNDLE_INPUTS "-input=${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
+    list(APPEND BUNDLE_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/main_${HIP_ARCHITECTURE}.${OBJ_TYPE}")
+endforeach()
+
+# Invoke clang-offload-bundler to generate an offload bundle.
+set(BUNDLE "${CMAKE_CURRENT_BINARY_DIR}/offload_bundle.hipfb")
+add_custom_command(
+    OUTPUT "${BUNDLE}"
+    COMMAND
+        "${OFFLOAD_BUNDLER_COMMAND}"
+        -type=o
+        -bundle-align=4096
+        "${BUNDLE_TARGETS}"
+        ${BUNDLE_INPUTS}
+        "-output=${BUNDLE}"
+    DEPENDS ${BUNDLE_OBJECTS}
+    VERBATIM)
+
+# Create the device binary by assembling the template that includes
+# the offload bundle that was just generated using an .incbin directive.
+# This needs an assembler.
+find_program(
+    LLVM_MC_COMMAND llvm-mc
+    PATH_SUFFIXES bin
+    PATHS
+    ${ROCM_ROOT}/llvm
+    ${CMAKE_INSTALL_PREFIX}/llvm)
+
+if(LLVM_MC_COMMAND)
+    message(STATUS "llvm-mc found: ${LLVM_MC_COMMAND}")
+else()
+    message(FATAL_ERROR "llvm-mc not found")
+endif()
+
+# Invoke llvm-mc to generate an object file containing the offload bundle.
+set(DEVICE_OBJECT "${CMAKE_CURRENT_BINARY_DIR}/main_device.${OBJ_TYPE}")
+add_custom_command(
+    OUTPUT "${DEVICE_OBJECT}"
+    COMMAND
+        "${LLVM_MC_COMMAND}"
+        -triple "${HOST_TARGET}"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_OBJ_GEN_FILE}"
+        -o "${DEVICE_OBJECT}"
+        --filetype=obj
+    DEPENDS "${BUNDLE}"
+    VERBATIM)
+
+# Finally, create the executable.
+add_executable(
+    ${example_name}
+    main.hip
+    ${DEVICE_OBJECT})
+
+# Make example runnable using ctest
+add_test(${example_name} ${example_name})
+
+set(include_dirs "../../Common")
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
diff --git a/HIP-Basic/llvm_ir_to_executable/Makefile b/HIP-Basic/llvm_ir_to_executable/Makefile
new file mode 100644
index 00000000..5fb8b2d0
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/Makefile
@@ -0,0 +1,90 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+EXAMPLE := hip_llvm_ir_to_executable
+COMMON_INCLUDE_DIR := ../../Common
+GPU_RUNTIME ?= HIP
+
+ifneq ($(GPU_RUNTIME), HIP)
+$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be HIP.)
+endif
+
+
+# HIP variables
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX 				  ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+CLANG                 ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang
+LLVM_MC               ?= $(ROCM_INSTALL_DIR)/llvm/bin/llvm-mc
+CLANG_OFFLOAD_BUNDLER ?= $(ROCM_INSTALL_DIR)/llvm/bin/clang-offload-bundler
+
+# Common variables and flags
+CXX_STD  := c++17
+CXXFLAGS := -std=$(CXX_STD)
+CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+LDFLAGS	 :=
+LDLIBS	 :=
+
+# Compile for these GPU architectures
+HIP_ARCHITECTURES ?= gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030
+
+# A literal white-space character cannot be passed to `subst` directly.
+# Therefore this `empty`/`space` hack is used when tokenizing HIP_ARCHITECTURES
+# and when creating GPU_ARCH_TRIPLES, which is later passed to CLANG_OFFLOAD_BUNDLER
+# in the targets option. The targets option needs to be a single string with no spaces.
+empty =
+space = $(empty) $(empty)
+comma = ,
+
+GPU_ARCHS := $(subst ;,$(space),$(HIP_ARCHITECTURES))
+GPU_ARCH_TRIPLES := $(subst $(space),$(comma),$(GPU_ARCHS:%=hipv4-amdgcn-amd-amdhsa--%))
+
+all: $(EXAMPLE)
+
+$(EXAMPLE): main.o main_device.o
+	$(HIPCXX) -o $@ $^
+
+main_device.o: hip_obj_gen.mcin offload_bundle.hipfb
+	$(LLVM_MC) -triple x86_64-unknown-linux-gnu -o $@ $< --filetype=obj
+
+offload_bundle.hipfb: $(GPU_ARCHS:%=main_%.o)
+	$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 \
+		-targets=host-x86_64-unknown-linux,$(GPU_ARCH_TRIPLES) \
+		-input=/dev/null \
+		$(^:%=-input=%) \
+		-output=$@
+
+main.o: main.hip
+	$(HIPCXX) $(CXXFLAGS) $(CPPFLAGS) -c --cuda-host-only $<
+
+main_%.o: main_%.ll
+	$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$* -o $@ $<
+
+# Remove every generated artifact, including the per-architecture device objects (main_<arch>.o).
+clean:
+	rm -f \
+		main_*.o \
+		main_*.bc \
+		offload_bundle.hipfb \
+		main.o \
+		$(EXAMPLE)
+
+.PHONY: all clean
diff --git a/HIP-Basic/llvm_ir_to_executable/README.md b/HIP-Basic/llvm_ir_to_executable/README.md
new file mode 100644
index 00000000..5189bc80
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/README.md
@@ -0,0 +1,117 @@
+# HIP-Basic LLVM-IR to Executable Example
+
+## Description
+This example shows how to manually compile and link a HIP application from device LLVM IR. Pre-generated LLVM-IR files are compiled into an _offload bundle_, a bundle of device object files, and then linked with the host object file to produce the final executable.
+
+LLVM IR is the intermediate representation used by the LLVM compiler infrastructure, which hipcc is built on. Building HIP executables from LLVM IR can be useful, for example, to experiment with specific LLVM instructions, or to help debug miscompilations.
+
+### Building
+
+- Build with Makefile: to compile for specific GPU architectures, optionally provide the `HIP_ARCHITECTURES` variable. Provide the architectures separated by semicolons.
+    ```shell
+    make HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030"
+    ```
+- Build with CMake:
+    ```shell
+    cmake -S . -B build -DCMAKE_HIP_ARCHITECTURES="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030"
+    cmake --build build
+    ```
+    On Windows the path to RC compiler may be needed: `-DCMAKE_RC_COMPILER="C:/Program Files (x86)/Windows Kits/path/to/x64/rc.exe"`
+
+## Generating device LLVM IR
+In this example, a HIP executable is compiled from device LLVM IR code. LLVM IR can be written completely manually, but in this example they are generated from `main.hip`, using the following commands:
+```shell
+$ROCM_INSTALL_DIR/bin/hipcc --cuda-device-only -c -emit-llvm ./main.hip --offload-arch=<arch> -o main_<arch>.bc -I ../../Common
+$ROCM_INSTALL_DIR/llvm/bin/llvm-dis main_<arch>.bc -o main_<arch>.ll
+```
+Where `<arch>` is the architecture to generate the LLVM IR for. Note that the `--cuda-device-only` flag is required to instruct `hipcc` to only generate LLVM IR for the device part of the computation, and `-c` is required to prevent the compiler from linking the outputs into an executable. In the case of this example, the LLVM IR files were generated using architectures `gfx803`, `gfx900`, `gfx906`, `gfx908`, `gfx90a`, `gfx1030`. The user may modify the `--offload-arch` flag to build for other architectures and choose to either enable or disable extra device code-generation features such as `xnack` or `sram-ecc`, which can be specified as `--offload-arch=<arch>:<feature>+` to enable it or `--offload-arch=<arch>:<feature>-` to disable it. Multiple features may be present, separated by colons.
+
+The first of these two commands generates a _bitcode_ module: this is a binary encoded version of LLVM IR. The second command, using `llvm-dis` disassembles the bitcode module into textual LLVM IR.
+
+## Build Process
+A HIP binary consists of a regular host executable, which has an offload bundle containing device code embedded inside it. This offload bundle contains object files for each of the target devices that it is compiled for, and is loaded at runtime to provide the machine code for the current device. A HIP executable can be built from device LLVM IR and host HIP code according to the following process:
+
+1. The `main.hip` file is compiled to an object file with `hipcc` that only contains host code by using the `--cuda-host-only` option. `main.hip` is a program that launches a simple kernel to compute the square of each element of a vector. The `-c` option is required to prevent the compiler from creating an executable, and make it create an object file containing the compiled host code instead.
+    ```shell
+    $ROCM_INSTALL_DIR/bin/hipcc -c --cuda-host-only main.hip
+    ```
+
+2. Each LLVM IR file is assembled to a device object file using `clang`. This requires specifying the correct architecture using `-target amdgcn-amd-amdhsa`, and the target architecture that should be assembled for using `-mcpu`:
+
+    ```shell
+    $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx1030 main_gfx1030.ll -o main_gfx1030.o
+    $ROCM_INSTALL_DIR/llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=<arch> main_<arch>.ll -o main_<arch>.o
+    ...
+    ```
+
+3. The device object files are combined into an offload bundle using `clang-offload-bundler`. This requires specifying the target as well as the offload kind for each device, in the form `<offload-kind>-<target>-<arch>`. For HIP device code, `<offload-kind>` is `hipv4`. Note that this command requires an (empty) entry for the host to also be present, with `<offload-kind>` `host`. The order of targets and inputs must match. `<target>` is an LLVM target triple, which is specified as `<isa>-<vendor>-<os>-<abi>`. `<abi>` is left empty for AMD targets.
+
+    ```shell
+    $ROCM_INSTALL_DIR/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 \
+            -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx1030,hipv4-... \
+            -input=/dev/null \
+            -input=main_gfx1030.o -input=... \
+            -output=offload_bundle.hipfb
+    ```
+
+    Note: using -bundle-align=4096 only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same `--offload-arch` as when compiling the source to LLVM bitcode.
+
+4. The offload bundle is embedded inside an object file that can be linked with the object file containing the host code. The offload bundle must be placed in the `.hip_fatbin` section, and must be placed after the symbol `__hip_fatbin`. This can be done by creating an assembly file that places the offload bundle in the appropriate section using the `.incbin` directive:
+    ```nasm
+        .type __hip_fatbin,@object
+        ; Tell the assembler to place the offload bundle in the appropriate section.
+        .section .hip_fatbin,"a",@progbits
+        ; Make the symbol that addresses the binary public
+        .globl __hip_fatbin
+        ; Give the bundle the required alignment
+        .p2align 12
+    __hip_fatbin:
+        ; Include the binary
+        .incbin "offload_bundle.hipfb"
+    ```
+    This file can then be assembled using `llvm-mc` as follows:
+    ```shell
+    $ROCM_INSTALL_DIR/llvm/bin/llvm-mc -triple <host target> -o main_device.o hip_obj_gen.mcin --filetype=obj
+    ```
+
+5. Finally, using the system linker, `hipcc`, or `clang`, the host object and device objects are linked into an executable:
+    ```shell
+    $ROCM_INSTALL_DIR/bin/hipcc -o hip_llvm_ir_to_executable main.o main_device.o
+    ```
+
+### Visual Studio 2019
+The above compilation steps are implemented in Visual Studio through Custom Build Steps and Custom Build Tools:
+- The host compilation from step 1 is performed by adding extra options to the source file, under `main.hip -> properties -> C/C++ -> Command Line`:
+    ```
+    Additional Options: --cuda-host-only
+    ```
+- Each device LLVM IR .ll file has a custom build tool associated to it, which performs the operation associated to step 2 from the previous section:
+    ```
+    Command Line: "$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a
+    Description: Compiling Device Assembly %(Identity)
+    Output: $(IntDir)%(FileName).o
+    Execute Before: ClCompile
+    ```
+- Steps 3 and 4 are implemented using a custom build step:
+    ```
+    Command Line:
+      "$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
+      cd $(IntDir) && "$(ClangToolPath)llvm-mc" -triple x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj
+    Description: Generating Device Offload Object
+    Outputs: $(IntDir)main_device.obj
+    Additional Dependencies: $(IntDir)main_gfx90a.o;$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)
+    Execute Before: ClCompile
+    ```
+- Finally step 5 is implemented by passing additional inputs to the linker in `project -> properties -> Linker -> Input`:
+    ```
+    Additional Dependencies: $(IntDir)main_device.obj;%(AdditionalDependencies)
+    ```
+
+## Used API surface
+### HIP runtime
+- `hipFree`
+- `hipGetDeviceProperties`
+- `hipGetLastError`
+- `hipLaunchKernelGGL`
+- `hipMalloc`
+- `hipMemcpy`
diff --git a/HIP-Basic/llvm_ir_to_executable/hip_obj_gen.mcin b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen.mcin
new file mode 100644
index 00000000..6b9fee5f
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen.mcin
@@ -0,0 +1,21 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#       HIP Object Generator
+# Use this generator to create a host bundled object file
+# with the input of an offload bundled fat binary.
+#
+# Input: Bundled Object file .hipfb file
+# Output: Host Bundled Object File .o
+
+    .type __hip_fatbin,@object
+    # Tell the assembler to place the offload bundle in the appropriate section.
+    .section .hip_fatbin,"a",@progbits
+    # Make the symbol that addresses the binary public.
+    .globl __hip_fatbin
+    # Give the bundle the required alignment of 4096 (2 ^ 12).
+    .p2align 12
+__hip_fatbin:
+    # Include the offload bundle.
+    .incbin "offload_bundle.hipfb"
diff --git a/HIP-Basic/llvm_ir_to_executable/hip_obj_gen_win.mcin b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen_win.mcin
new file mode 100644
index 00000000..3636354e
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/hip_obj_gen_win.mcin
@@ -0,0 +1,20 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#       HIP Object Generator
+# Use this generator to create a host bundled object file
+# with the input of an offload bundled fat binary.
+#
+# Input: Bundled Object file .hipfb file
+# Output: Host Bundled Object File .obj
+
+    # Tell the assembler to place the offload bundle in the appropriate section.
+    .section .hip_fatbin,"dw"
+    # Make the symbol that addresses the binary public.
+    .globl __hip_fatbin
+    # Give the bundle the required alignment of 4096 (2 ^ 12).
+    .p2align 12
+__hip_fatbin:
+    # Include the offload bundle.
+    .incbin "offload_bundle.hipfb"
diff --git a/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln
new file mode 100644
index 00000000..a53dc2ec
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32630.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llvm_ir_to_executable_vs2019", "llvm_ir_to_executable_vs2019.vcxproj", "{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.ActiveCfg = Debug|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.Build.0 = Debug|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.ActiveCfg = Release|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {0A13532C-E06B-4427-9847-54070C1E8622}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj
new file mode 100644
index 00000000..c0e820b4
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj
@@ -0,0 +1,183 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--cuda-host-only</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">--cuda-host-only</AdditionalOptions>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="hip_obj_gen_win.mcin">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Copying %(Identity)</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Identity)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(IntDir)%(Identity)"</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Copying %(Identity)</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Identity)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx1030.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx1030</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx803.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa  -mcpu=gfx803</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa  -mcpu=gfx803</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx900.ll"> <!-- Device LLVM IR for gfx900, compiled to $(IntDir)main_gfx900.o. -->
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx900</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx906.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx906</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx908.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx908</Command>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx90a.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa -mcpu=gfx90a </Command>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{dbb8dfe9-cb1b-473c-937c-2a8120e0d819}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>llvm_ir_to_executable_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+    <CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+    <CustomBuildBeforeTargets>ClCompile</CustomBuildBeforeTargets>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <!-- Per-item defaults for Debug|x64 builds. -->
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link> <!-- Links the generated device offload object into the executable. -->
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CustomBuild> <!-- Defaults for CustomBuild items; the items in this project override Command. -->
+      <Message>Compiling Device LLVM IR %(Identity)</Message>
+      <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
+      <Outputs>$(IntDir)%(FileName).o</Outputs>
+    </CustomBuild>
+    <CustomBuildStep> <!-- Bundles the per-architecture device objects, then assembles the bundle into main_device.obj. -->
+      <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
+cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Message>Generating Device Offload Object</Message>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Outputs>$(IntDir)main_device.obj</Outputs>
+    </CustomBuildStep>
+    <CustomBuildStep> <!-- Inputs are the device objects plus the copy of hip_obj_gen_win.mcin placed in $(IntDir). -->
+      <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Per-item defaults for Release|x64 builds. -->
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link> <!-- Links the generated device offload object into the executable. -->
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>$(IntDir)main_device.obj;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CustomBuild> <!-- Defaults for CustomBuild items; the items in this project override Command. -->
+      <Message>Compiling Device LLVM IR %(Identity)</Message>
+      <Command>"$(ClangToolPath)clang++" -o "$(IntDir)%(FileName).o" "%(Identity)" -target amdgcn-amd-amdhsa</Command>
+      <Outputs>$(IntDir)%(FileName).o</Outputs>
+    </CustomBuild>
+    <CustomBuildStep> <!-- Bundles the per-architecture device objects, then assembles the bundle into main_device.obj. -->
+      <Command>"$(ClangToolPath)clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-pc-windows-msvc,hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900,hipv4-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx1030 -input=nul "-input=$(IntDir)main_gfx803.o" "-input=$(IntDir)main_gfx900.o" "-input=$(IntDir)main_gfx906.o" "-input=$(IntDir)main_gfx908.o" "-input=$(IntDir)main_gfx90a.o" "-input=$(IntDir)main_gfx1030.o" "-output=$(IntDir)offload_bundle.hipfb"
+cd $(IntDir) &amp;&amp; "$(ClangToolPath)llvm-mc" -triple host-x86_64-pc-windows-msvc "hip_obj_gen_win.mcin" -o "main_device.obj" --filetype=obj</Command>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Message>Generating Device Offload Object</Message>
+    </CustomBuildStep>
+    <CustomBuildStep>
+      <Outputs>$(IntDir)main_device.obj</Outputs>
+    </CustomBuildStep>
+    <CustomBuildStep> <!-- Inputs are the device objects plus the copy of hip_obj_gen_win.mcin placed in $(IntDir). -->
+      <Inputs>$(IntDir)main_gfx803.o;$(IntDir)main_gfx900.o;$(IntDir)main_gfx906.o;$(IntDir)main_gfx908.o;$(IntDir)main_gfx90a.o;$(IntDir)main_gfx1030.o;$(IntDir)hip_obj_gen_win.mcin;%(Inputs)</Inputs>
+    </CustomBuildStep>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters
new file mode 100644
index 00000000..25c408b7
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/llvm_ir_to_executable_vs2019.vcxproj.filters
@@ -0,0 +1,50 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4f2a1544-a556-4afb-b630-36ba54c0ab4a}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{b93521e0-9944-411a-9f6e-4071af6bc7ea}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{972f07c3-b925-4516-bd65-2d5a3f626888}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup> <!-- Custom-build inputs: one LLVM IR file per GPU architecture plus the object generator source; each listed once. -->
+    <CustomBuild Include="main_gfx803.ll">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx900.ll">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx906.ll">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx908.ll">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx90a.ll">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="main_gfx1030.ll">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="hip_obj_gen_win.mcin">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+  </ItemGroup>
+</Project>
diff --git a/HIP-Basic/llvm_ir_to_executable/main.hip b/HIP-Basic/llvm_ir_to_executable/main.hip
new file mode 100644
index 00000000..588fc070
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main.hip
@@ -0,0 +1,118 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "example_utils.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+/// \brief Kernel that squares every element of `in` and stores the result
+/// at the same index of `out`; only indices below `size` are processed.
+template<typename T>
+__global__ void vector_square_kernel(T* out, const T* in, const long long size)
+{
+    // Threads per block times blocks in the grid (x dimension): the distance
+    // between two consecutive elements handled by the same thread.
+    const size_t step = blockDim.x * gridDim.x;
+    // Begin at this thread's global index and advance by `step`.
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    while(idx < size)
+    {
+        out[idx] = in[idx] * in[idx];
+        idx += step;
+    }
+}
+
+int main()
+{
+    // Element count, buffer size in bytes, and the combined size of both buffers in MiB.
+    constexpr size_t size          = 1000000;
+    constexpr size_t size_in_bytes = size * sizeof(float);
+    constexpr double total_mib     = 2 * size_in_bytes / 1024.0 / 1024.0;
+
+    // Report the name of the device with ID 0.
+    hipDeviceProp_t device_props;
+    HIP_CHECK(hipGetDeviceProperties(&device_props, 0 /*deviceID*/));
+    std::cout << "info: running on device " << device_props.name << "\n";
+
+    std::cout << "info: allocate host mem (" << total_mib << " MiB) " << "\n";
+
+    // Host buffers: the input values and the results copied back from the device.
+    std::vector<float> host_input(size);
+    std::vector<float> host_output(size);
+
+    // Fill the host input: element idx holds 1.618 + idx.
+    for(size_t idx = 0; idx < size; ++idx)
+    {
+        host_input[idx] = 1.618f + idx;
+    }
+
+    // Device buffers, each the same size as one host buffer.
+    float *device_input, *device_output;
+    std::cout << "info: allocate device mem (" << total_mib << " MiB)\n";
+    HIP_CHECK(hipMalloc(&device_input, size_in_bytes));
+    HIP_CHECK(hipMalloc(&device_output, size_in_bytes));
+
+    std::cout << "info: copy Host2Device\n";
+    // Upload the input values to the device.
+    HIP_CHECK(hipMemcpy(device_input, host_input.data(), size_in_bytes, hipMemcpyHostToDevice));
+
+    // Launch configuration: blocks per grid and threads per block.
+    constexpr unsigned int grid_size         = 512;
+    constexpr unsigned int threads_per_block = 256;
+
+    std::cout << "info: launch 'vector_square_kernel' kernel\n";
+    hipLaunchKernelGGL(vector_square_kernel,
+                       grid_size,
+                       threads_per_block,
+                       0,
+                       hipStreamDefault,
+                       device_output,
+                       device_input,
+                       size);
+
+    // Surface any error reported for the kernel launch.
+    HIP_CHECK(hipGetLastError());
+
+    std::cout << "info: copy Device2Host\n";
+    HIP_CHECK(hipMemcpy(host_output.data(), device_output, size_in_bytes, hipMemcpyDeviceToHost));
+
+    HIP_CHECK(hipFree(device_input));
+    HIP_CHECK(hipFree(device_output));
+
+    std::cout << "info: check result\n";
+    for(size_t idx = 0; idx < size; ++idx)
+    {
+        const float expected = host_input[idx] * host_input[idx];
+        if(host_output[idx] == expected)
+        {
+            continue;
+        }
+        std::cerr << "FAILED! h_out[" << idx << "] = " << host_output[idx]
+                  << ", expected:  " << expected << '\n';
+        exit(error_exit_code);
+    }
+    std::cout << "PASSED!\n";
+}
diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll
new file mode 100644
index 00000000..31c713de
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main_gfx1030.ll
@@ -0,0 +1,97 @@
+; ModuleID = 'main_gfx1030.bc'
+source_filename = "./main.hip"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
+
+@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
+@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind
+define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { ; %0 is stored to, %1 is loaded from, %2 bounds the loop
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 ; workgroup index in x
+  %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 ; pointer to the dispatch packet
+  %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
+  %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)*
+  %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 ; i32 at packet offset 12 (presumably grid_size_x; confirm against the HSA packet layout)
+  %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4
+  %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)*
+  %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 ; i16 at packet offset 4 (presumably workgroup_size_x; confirm)
+  %12 = zext i16 %11 to i32
+  %13 = mul i32 %4, %12
+  %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 ; work-item index in x
+  %15 = add i32 %13, %14 ; 32-bit starting index of this work-item
+  %16 = zext i32 %15 to i64
+  %17 = zext i32 %8 to i64 ; loop stride, added to the index each iteration
+  %18 = icmp ult i64 %16, %2 ; unsigned compare of the starting index against %2
+  br i1 %18, label %20, label %19 ; skip the loop when the starting index is out of range
+
+19:                                               ; preds = %20, %3
+  ret void
+
+20:                                               ; preds = %3, %20
+  %21 = phi i64 [ %26, %20 ], [ %16, %3 ] ; current element index
+  %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21
+  %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16
+  %24 = fmul contract float %23, %23 ; square the loaded element
+  %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21
+  store float %24, float addrspace(1)* %25, align 4, !tbaa !16
+  %26 = add i64 %21, %17 ; advance by the stride
+  %27 = icmp ult i64 %26, %2
+  br i1 %27, label %20, label %19, !llvm.loop !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.ocl.version = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 1}
+!2 = !{i32 2, i32 0}
+!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
+!4 = !{!5, !9, i64 12}
+!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!"int", !7, i64 0}
+!10 = !{!"long", !7, i64 0}
+!11 = !{!"any pointer", !7, i64 0}
+!12 = !{!"hsa_signal_s", !10, i64 0}
+!13 = !{i16 1, i16 1025}
+!14 = !{}
+!15 = !{i32 0, i32 1024}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C++ TBAA"}
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.mustprogress"}
diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx803.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx803.ll
new file mode 100644
index 00000000..a0d9f588
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main_gfx803.ll
@@ -0,0 +1,97 @@
+; ModuleID = 'main_gfx803.bc'
+source_filename = "./main.hip"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
+
+@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
+@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind
+define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { ; %0 is stored to, %1 is loaded from, %2 bounds the loop
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 ; workgroup index in x
+  %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 ; pointer to the dispatch packet
+  %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12
+  %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)*
+  %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 ; i32 at packet offset 12 (presumably grid_size_x; confirm against the HSA packet layout)
+  %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4
+  %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)*
+  %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 ; i16 at packet offset 4 (presumably workgroup_size_x; confirm)
+  %12 = zext i16 %11 to i32
+  %13 = mul i32 %4, %12
+  %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 ; work-item index in x
+  %15 = add i32 %13, %14 ; 32-bit starting index of this work-item
+  %16 = zext i32 %15 to i64
+  %17 = zext i32 %8 to i64 ; loop stride, added to the index each iteration
+  %18 = icmp ult i64 %16, %2 ; unsigned compare of the starting index against %2
+  br i1 %18, label %20, label %19 ; skip the loop when the starting index is out of range
+
+19:                                               ; preds = %20, %3
+  ret void
+
+20:                                               ; preds = %3, %20
+  %21 = phi i64 [ %26, %20 ], [ %16, %3 ] ; current element index
+  %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21
+  %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16
+  %24 = fmul contract float %23, %23 ; square the loaded element
+  %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21
+  store float %24, float addrspace(1)* %25, align 4, !tbaa !16
+  %26 = add i64 %21, %17 ; advance by the stride
+  %27 = icmp ult i64 %26, %2
+  br i1 %27, label %20, label %19, !llvm.loop !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx803" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.ocl.version = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 1}
+!2 = !{i32 2, i32 0}
+!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
+!4 = !{!5, !9, i64 12}
+!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!"int", !7, i64 0}
+!10 = !{!"long", !7, i64 0}
+!11 = !{!"any pointer", !7, i64 0}
+!12 = !{!"hsa_signal_s", !10, i64 0}
+!13 = !{i16 1, i16 1025}
+!14 = !{}
+!15 = !{i32 0, i32 1024}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C++ TBAA"}
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.mustprogress"}
diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx900.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx900.ll
new file mode 100644
index 00000000..67ff0a30
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main_gfx900.ll
@@ -0,0 +1,97 @@
+; ModuleID = 'main_gfx900.bc'
+source_filename = "./main.hip"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
+
+@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
+@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind
+define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { ; %0 = output, %1 = read-only input, %2 = i64 loop bound
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 ; x index of this work-group
+  %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 ; dispatch packet pointer (hsa_kernel_dispatch_packet_s per !5)
+  %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 ; byte offset 12: grid_size_x in the HSA packet layout
+  %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)*
+  %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 ; %8 = grid size in x, in work-items
+  %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 ; byte offset 4: workgroup_size_x in the HSA packet layout
+  %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)*
+  %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 ; %11 = work-group size in x, in [1, 1025) per !13
+  %12 = zext i16 %11 to i32
+  %13 = mul i32 %4, %12 ; work-group index * work-group size
+  %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 ; x index of this work-item inside its work-group
+  %15 = add i32 %13, %14 ; global x index of this work-item
+  %16 = zext i32 %15 to i64 ; first element index handled by this work-item
+  %17 = zext i32 %8 to i64 ; loop stride = grid size in x
+  %18 = icmp ult i64 %16, %2 ; is the first index below the bound %2?
+  br i1 %18, label %20, label %19 ; enter the loop (%20) or return immediately (%19)
+
+19:                                               ; preds = %20, %3
+  ret void
+
+20:                                               ; preds = %3, %20
+  %21 = phi i64 [ %26, %20 ], [ %16, %3 ] ; current element index
+  %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21
+  %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 ; read input element
+  %24 = fmul contract float %23, %23 ; square it
+  %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21
+  store float %24, float addrspace(1)* %25, align 4, !tbaa !16 ; write the square to the same index of the output
+  %26 = add i64 %21, %17 ; advance by the stride
+  %27 = icmp ult i64 %26, %2 ; continue while the next index is below %2
+  br i1 %27, label %20, label %19, !llvm.loop !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.ocl.version = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 1}
+!2 = !{i32 2, i32 0}
+!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
+!4 = !{!5, !9, i64 12}
+!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!"int", !7, i64 0}
+!10 = !{!"long", !7, i64 0}
+!11 = !{!"any pointer", !7, i64 0}
+!12 = !{!"hsa_signal_s", !10, i64 0}
+!13 = !{i16 1, i16 1025}
+!14 = !{}
+!15 = !{i32 0, i32 1024}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C++ TBAA"}
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.mustprogress"}
diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx906.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx906.ll
new file mode 100644
index 00000000..76819daf
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main_gfx906.ll
@@ -0,0 +1,97 @@
+; ModuleID = 'main_gfx906.bc'
+source_filename = "./main.hip"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
+
+@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
+@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind
+define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { ; %0 = output, %1 = read-only input, %2 = i64 loop bound
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 ; x index of this work-group
+  %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 ; dispatch packet pointer (hsa_kernel_dispatch_packet_s per !5)
+  %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 ; byte offset 12: grid_size_x in the HSA packet layout
+  %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)*
+  %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 ; %8 = grid size in x, in work-items
+  %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 ; byte offset 4: workgroup_size_x in the HSA packet layout
+  %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)*
+  %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 ; %11 = work-group size in x, in [1, 1025) per !13
+  %12 = zext i16 %11 to i32
+  %13 = mul i32 %4, %12 ; work-group index * work-group size
+  %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 ; x index of this work-item inside its work-group
+  %15 = add i32 %13, %14 ; global x index of this work-item
+  %16 = zext i32 %15 to i64 ; first element index handled by this work-item
+  %17 = zext i32 %8 to i64 ; loop stride = grid size in x
+  %18 = icmp ult i64 %16, %2 ; is the first index below the bound %2?
+  br i1 %18, label %20, label %19 ; enter the loop (%20) or return immediately (%19)
+
+19:                                               ; preds = %20, %3
+  ret void
+
+20:                                               ; preds = %3, %20
+  %21 = phi i64 [ %26, %20 ], [ %16, %3 ] ; current element index
+  %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21
+  %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 ; read input element
+  %24 = fmul contract float %23, %23 ; square it
+  %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21
+  store float %24, float addrspace(1)* %25, align 4, !tbaa !16 ; write the square to the same index of the output
+  %26 = add i64 %21, %17 ; advance by the stride
+  %27 = icmp ult i64 %26, %2 ; continue while the next index is below %2
+  br i1 %27, label %20, label %19, !llvm.loop !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.ocl.version = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 1}
+!2 = !{i32 2, i32 0}
+!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
+!4 = !{!5, !9, i64 12}
+!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!"int", !7, i64 0}
+!10 = !{!"long", !7, i64 0}
+!11 = !{!"any pointer", !7, i64 0}
+!12 = !{!"hsa_signal_s", !10, i64 0}
+!13 = !{i16 1, i16 1025}
+!14 = !{}
+!15 = !{i32 0, i32 1024}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C++ TBAA"}
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.mustprogress"}
diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx908.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx908.ll
new file mode 100644
index 00000000..50a94f21
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main_gfx908.ll
@@ -0,0 +1,97 @@
+; ModuleID = 'main_gfx908.bc'
+source_filename = "./main.hip"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
+
+@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
+@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind
+define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { ; %0 = output, %1 = read-only input, %2 = i64 loop bound
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 ; x index of this work-group
+  %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 ; dispatch packet pointer (hsa_kernel_dispatch_packet_s per !5)
+  %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 ; byte offset 12: grid_size_x in the HSA packet layout
+  %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)*
+  %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 ; %8 = grid size in x, in work-items
+  %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 ; byte offset 4: workgroup_size_x in the HSA packet layout
+  %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)*
+  %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 ; %11 = work-group size in x, in [1, 1025) per !13
+  %12 = zext i16 %11 to i32
+  %13 = mul i32 %4, %12 ; work-group index * work-group size
+  %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 ; x index of this work-item inside its work-group
+  %15 = add i32 %13, %14 ; global x index of this work-item
+  %16 = zext i32 %15 to i64 ; first element index handled by this work-item
+  %17 = zext i32 %8 to i64 ; loop stride = grid size in x
+  %18 = icmp ult i64 %16, %2 ; is the first index below the bound %2?
+  br i1 %18, label %20, label %19 ; enter the loop (%20) or return immediately (%19)
+
+19:                                               ; preds = %20, %3
+  ret void
+
+20:                                               ; preds = %3, %20
+  %21 = phi i64 [ %26, %20 ], [ %16, %3 ] ; current element index
+  %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21
+  %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 ; read input element
+  %24 = fmul contract float %23, %23 ; square it
+  %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21
+  store float %24, float addrspace(1)* %25, align 4, !tbaa !16 ; write the square to the same index of the output
+  %26 = add i64 %21, %17 ; advance by the stride
+  %27 = icmp ult i64 %26, %2 ; continue while the next index is below %2
+  br i1 %27, label %20, label %19, !llvm.loop !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx908" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.ocl.version = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 1}
+!2 = !{i32 2, i32 0}
+!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
+!4 = !{!5, !9, i64 12}
+!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!"int", !7, i64 0}
+!10 = !{!"long", !7, i64 0}
+!11 = !{!"any pointer", !7, i64 0}
+!12 = !{!"hsa_signal_s", !10, i64 0}
+!13 = !{i16 1, i16 1025}
+!14 = !{}
+!15 = !{i32 0, i32 1024}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C++ TBAA"}
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.mustprogress"}
diff --git a/HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll b/HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll
new file mode 100644
index 00000000..dc293da3
--- /dev/null
+++ b/HIP-Basic/llvm_ir_to_executable/main_gfx90a.ll
@@ -0,0 +1,97 @@
+; ModuleID = 'main_gfx90a.bc'
+source_filename = "./main.hip"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" = type { i8 }
+%"struct.__HIP_Coordinates<__HIP_GridDim>::__X" = type { i8 }
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = comdat any
+
+$_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = comdat any
+
+@_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" undef, comdat, align 1
+@_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE = weak protected addrspace(4) externally_initialized constant %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" undef, comdat, align 1
+@llvm.compiler.used = appending addrspace(1) global [4 x i8*] [i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_GridDim>::__X", %"struct.__HIP_Coordinates<__HIP_GridDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockDim>::__X", %"struct.__HIP_Coordinates<__HIP_BlockDim>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X", %"struct.__HIP_Coordinates<__HIP_BlockIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(4)* getelementptr inbounds (%"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X", %"struct.__HIP_Coordinates<__HIP_ThreadIdx>::__X" addrspace(4)* @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind
+define protected amdgpu_kernel void @_Z20vector_square_kernelIfEvPT_PKS0_x(float addrspace(1)* nocapture %0, float addrspace(1)* nocapture readonly %1, i64 %2) local_unnamed_addr #0 { ; %0 = output, %1 = read-only input, %2 = i64 loop bound
+  %4 = tail call i32 @llvm.amdgcn.workgroup.id.x() #2 ; x index of this work-group
+  %5 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 ; dispatch packet pointer (hsa_kernel_dispatch_packet_s per !5)
+  %6 = getelementptr inbounds i8, i8 addrspace(4)* %5, i64 12 ; byte offset 12: grid_size_x in the HSA packet layout
+  %7 = bitcast i8 addrspace(4)* %6 to i32 addrspace(4)*
+  %8 = load i32, i32 addrspace(4)* %7, align 4, !tbaa !4 ; %8 = grid size in x, in work-items
+  %9 = getelementptr i8, i8 addrspace(4)* %5, i64 4 ; byte offset 4: workgroup_size_x in the HSA packet layout
+  %10 = bitcast i8 addrspace(4)* %9 to i16 addrspace(4)*
+  %11 = load i16, i16 addrspace(4)* %10, align 4, !range !13, !invariant.load !14 ; %11 = work-group size in x, in [1, 1025) per !13
+  %12 = zext i16 %11 to i32
+  %13 = mul i32 %4, %12 ; work-group index * work-group size
+  %14 = tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !15 ; x index of this work-item inside its work-group
+  %15 = add i32 %13, %14 ; global x index of this work-item
+  %16 = zext i32 %15 to i64 ; first element index handled by this work-item
+  %17 = zext i32 %8 to i64 ; loop stride = grid size in x
+  %18 = icmp ult i64 %16, %2 ; is the first index below the bound %2?
+  br i1 %18, label %20, label %19 ; enter the loop (%20) or return immediately (%19)
+
+19:                                               ; preds = %20, %3
+  ret void
+
+20:                                               ; preds = %3, %20
+  %21 = phi i64 [ %26, %20 ], [ %16, %3 ] ; current element index
+  %22 = getelementptr inbounds float, float addrspace(1)* %1, i64 %21
+  %23 = load float, float addrspace(1)* %22, align 4, !tbaa !16 ; read input element
+  %24 = fmul contract float %23, %23 ; square it
+  %25 = getelementptr inbounds float, float addrspace(1)* %0, i64 %21
+  store float %24, float addrspace(1)* %25, align 4, !tbaa !16 ; write the square to the same index of the output
+  %26 = add i64 %21, %17 ; advance by the stride
+  %27 = icmp ult i64 %26, %2 ; continue while the next index is below %2
+  br i1 %27, label %20, label %19, !llvm.loop !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind "amdgpu-flat-work-group-size"="1,1024" "amdgpu-implicitarg-num-bytes"="56" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.ocl.version = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 1}
+!2 = !{i32 2, i32 0}
+!3 = !{!"AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.0.0 22051 235b6880e2e515507478181ec11a20c1ec87945b)"}
+!4 = !{!5, !9, i64 12}
+!5 = !{!"hsa_kernel_dispatch_packet_s", !6, i64 0, !6, i64 2, !6, i64 4, !6, i64 6, !6, i64 8, !6, i64 10, !9, i64 12, !9, i64 16, !9, i64 20, !9, i64 24, !9, i64 28, !10, i64 32, !11, i64 40, !10, i64 48, !12, i64 56}
+!6 = !{!"short", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+!9 = !{!"int", !7, i64 0}
+!10 = !{!"long", !7, i64 0}
+!11 = !{!"any pointer", !7, i64 0}
+!12 = !{!"hsa_signal_s", !10, i64 0}
+!13 = !{i16 1, i16 1025}
+!14 = !{}
+!15 = !{i32 0, i32 1024}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C++ TBAA"}
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.mustprogress"}
diff --git a/HIP-Basic/matrix_multiplication/Makefile b/HIP-Basic/matrix_multiplication/Makefile
index 151aa45b..ba6d2ade 100644
--- a/HIP-Basic/matrix_multiplication/Makefile
+++ b/HIP-Basic/matrix_multiplication/Makefile
@@ -45,7 +45,7 @@ else
 $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
 endif
 
-$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
+$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/cmdparser.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
 	$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
 
 clean:
diff --git a/HIP-Basic/matrix_multiplication/argument_parsing.hpp b/HIP-Basic/matrix_multiplication/argument_parsing.hpp
deleted file mode 100644
index 80ffc270..00000000
--- a/HIP-Basic/matrix_multiplication/argument_parsing.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-// MIT License
-//
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#ifndef HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP
-#define HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP
-
-#include "example_utils.hpp"
-
-#include <charconv>
-#include <iostream>
-#include <string>
-#include <string_view>
-
-#include <cstdlib>
-
-/// \brief Tries to read the matrix dimensions from the command line.
-/// If no command line arguments were provided, the passed values are not modified.
-/// Otherwise, the number of arguments must be 3: <A rows> <A columns> <B columns>
-/// (B rows will be equal to A columns).
-/// If the number of arguments is different, or the arguments cannot be parsed to
-/// unsigned ints, an error message is printed and the program exits with a non-zero code.
-inline void matrix_dimensions_from_command_line(const int          argc,
-                                                const char*        argv[],
-                                                unsigned int&      a_rows,
-                                                unsigned int&      a_cols,
-                                                unsigned int&      b_cols,
-                                                const unsigned int block_size)
-{
-    const auto print_usage_and_exit = [=]()
-    {
-        const std::string usage_message
-            = "Calculates matrix product A*B.\n"
-              "Usage: hip_matrix_multiplication [<A rows> <A columns> <B columns>].\n"
-              "Matrix dimensions must be positive multiples of block_size ("
-              + std::to_string(block_size) + ")";
-        std::cout << usage_message << std::endl;
-        exit(error_exit_code);
-    };
-    const auto get_argument_by_index = [=](const unsigned int index) -> unsigned int
-    {
-        const std::string_view argument_text(argv[index]);
-
-        unsigned int converted_value;
-        const auto   conversion_result = std::from_chars(argument_text.data(),
-                                                       argument_text.data() + argument_text.size(),
-                                                       converted_value);
-        if(conversion_result.ec != std::errc{} || (converted_value % block_size) != 0)
-        {
-            print_usage_and_exit();
-        }
-        return converted_value;
-    };
-
-    if(argc == 1)
-    {
-        return;
-    }
-    if(argc != 4)
-    {
-        print_usage_and_exit();
-    }
-    a_rows = get_argument_by_index(1);
-    a_cols = get_argument_by_index(2);
-    b_cols = get_argument_by_index(3);
-}
-
-#endif // HIP_BASIC_MATRIX_MULTIPLICATION_ARGUMENT_PARSING_HPP
diff --git a/HIP-Basic/matrix_multiplication/main.hip b/HIP-Basic/matrix_multiplication/main.hip
index e651856a..90836e37 100644
--- a/HIP-Basic/matrix_multiplication/main.hip
+++ b/HIP-Basic/matrix_multiplication/main.hip
@@ -20,7 +20,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
-#include "argument_parsing.hpp"
+#include "cmdparser.hpp"
 #include "example_utils.hpp"
 
 #include <hip/hip_runtime.h>
@@ -108,18 +108,53 @@ __global__ void matrix_multiplication_kernel(const float*       A,
     // Every thread stores the final result to global memory.
     C[block_offset + b_cols * ty + tx] = thread_result;
 }
+template<unsigned int BlockSize>
+void configure_parser(cli::Parser& parser)
+{
+    // Registers the optional arguments A_rows, A_cols and B_cols on parser, using these defaults.
+    constexpr unsigned int a_rows = 2048;
+    constexpr unsigned int a_cols = 1024;
+    constexpr unsigned int b_cols = 1024;
+
+    static_assert( // Compile-time check that the default dimensions above are multiples of BlockSize.
+        ((a_rows % BlockSize == 0) && (a_cols % BlockSize == 0) && (b_cols % BlockSize == 0)),
+        "Matrix dimensions must be positive multiples of block_size");
+
+    parser.set_optional<unsigned int>("A_rows",
+                                      "A_rows",
+                                      a_rows,
+                                      "Number of rows in Matrix A"); // Default: a_rows (2048)
+    parser.set_optional<unsigned int>("A_cols",
+                                      "A_cols",
+                                      a_cols,
+                                      "Number of columns in Matrix A"); // Default: a_cols (1024)
+    parser.set_optional<unsigned int>("B_cols",
+                                      "B_cols",
+                                      b_cols,
+                                      "Number of columns in Matrix B"); // Default: b_cols (1024)
+}
 
 int main(int argc, const char* argv[])
 {
     constexpr unsigned int block_size = 16;
 
-    // Default values are provided below.
-    unsigned int a_rows = 2048;
-    unsigned int a_cols = 1024;
-    unsigned int b_cols = 1024;
+    // Parse user inputs
+    cli::Parser parser(argc, argv);
+    configure_parser<block_size>(parser);
+    parser.run_and_exit_if_error();
 
     // Get matrix dimensions from the command line, if provided.
-    matrix_dimensions_from_command_line(argc, argv, a_rows, a_cols, b_cols, block_size);
+    const unsigned int a_rows = parser.get<unsigned int>("A_rows");
+    const unsigned int a_cols = parser.get<unsigned int>("A_cols");
+    const unsigned int b_cols = parser.get<unsigned int>("B_cols");
+
+    if((a_rows % block_size != 0) || (a_cols % block_size != 0) || (b_cols % block_size != 0))
+    {
+        std::cout << "Matrix dimensions must be positive multiples of block_size ("
+                         + std::to_string(block_size) + ")"
+                  << std::endl;
+        exit(error_exit_code);
+    }
 
     // Outer matrix dimensions must match.
     const unsigned int b_rows = a_cols;
diff --git a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln
index 9297291b..b11412dc 100644
--- a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln
+++ b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.sln
@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 16
 VisualStudioVersion = 16.0.32630.194
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example_template_vs2019", "example_template_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix_multiplication_vs2019", "matrix_multiplication_vs2019.vcxproj", "{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
diff --git a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj
index 4d1790e6..81bac082 100644
--- a/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj
+++ b/HIP-Basic/matrix_multiplication/matrix_multiplication_vs2019.vcxproj
@@ -1,97 +1,101 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="main.hip" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\Common\example_utils.hpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <VCProjectVersion>15.0</VCProjectVersion>
-    <ProjectGuid>{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>matrix_multiplication_vs2019</RootNamespace>
-    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>HIP</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>HIP</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
-  </ImportGroup>
-  <ImportGroup Label="Shared">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>false</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
-  </PropertyGroup>
-  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level1</WarningLevel>
-      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level2</WarningLevel>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
-  </ImportGroup>
-</Project>
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>matrix_multiplication_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/occupancy/README.md b/HIP-Basic/occupancy/README.md
index aced51a6..d641278e 100644
--- a/HIP-Basic/occupancy/README.md
+++ b/HIP-Basic/occupancy/README.md
@@ -1,7 +1,7 @@
-# HIP-Basic Occupany Example
+# HIP-Basic Occupancy Example
 
 ## Description
-This example showcases how to find optimal configuation parameters for a kernel launch with maximum occupancy. It uses the HIP occupancy calculator APIs to find a kernel launch configuration that yields maximum occupancy. This configuration is used to launch a kernel and measures the utilization difference against another kernel launch that is manually (and suboptimally) configured. The application kernel is a simple vector-vector multiplication of the form `C[i] = A[i]*B[i]`, where `A`, `B` and `C` are vectors of size `size`.
+This example showcases how to find optimal configuration parameters for a kernel launch with maximum occupancy. It uses the HIP occupancy calculator APIs to find a kernel launch configuration that yields maximum occupancy. This configuration is used to launch a kernel and measures the utilization difference against another kernel launch that is manually (and suboptimally) configured. The application kernel is a simple vector--vector multiplication of the form `C[i] = A[i]*B[i]`, where `A`, `B` and `C` are vectors of size `size`.
 
 The example shows 100% occupancy for both manual and automatic configurations, because the simple kernel does not use much resources per-thread or per-block, especially `__shared__` memory. The execution time for the automatic launch is still lower because of a lower overhead associated with fewer blocks being executed.
 
diff --git a/HIP-Basic/occupancy/occupancy_vs2019.vcxproj b/HIP-Basic/occupancy/occupancy_vs2019.vcxproj
index b8eee422..a1c151fa 100644
--- a/HIP-Basic/occupancy/occupancy_vs2019.vcxproj
+++ b/HIP-Basic/occupancy/occupancy_vs2019.vcxproj
@@ -1,95 +1,99 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="main.hip" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\Common\example_utils.hpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <VCProjectVersion>15.0</VCProjectVersion>
-    <ProjectGuid>{e5b2fc79-3928-47f6-b57b-33aaa3c5d9c5}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>occupancy_vs2019</RootNamespace>
-    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>HIP</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>HIP</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
-  </ImportGroup>
-  <ImportGroup Label="Shared">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>false</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
-  </PropertyGroup>
-  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level1</WarningLevel>
-      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level2</WarningLevel>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
-  </ImportGroup>
-</Project>
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{e5b2fc79-3928-47f6-b57b-33aaa3c5d9c5}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>occupancy_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/runtime_compilation/.gitignore b/HIP-Basic/runtime_compilation/.gitignore
new file mode 100644
index 00000000..080d9030
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/.gitignore
@@ -0,0 +1 @@
+hip_runtime_compilation
diff --git a/HIP-Basic/runtime_compilation/CMakeLists.txt b/HIP-Basic/runtime_compilation/CMakeLists.txt
new file mode 100644
index 00000000..47974f94
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/CMakeLists.txt
@@ -0,0 +1,70 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_runtime_compilation)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+set(GPU_RUNTIMES "HIP" "CUDA")
+set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})
+
+if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+enable_language(${GPU_RUNTIME})
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH)
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+add_executable(${example_name} main.hip)
+# Make example runnable using ctest
+add_test(${example_name} ${example_name})
+
+set(link_libs "")
+set(include_dirs "../../Common")
+
+if(GPU_RUNTIME STREQUAL "HIP")
+    # Link hiprtc library
+    find_library(HIPRTC_LIB hiprtc REQUIRED)
+    list(APPEND link_libs "${HIPRTC_LIB}")
+endif()
+
+if(GPU_RUNTIME STREQUAL "CUDA")
+    # Include the HIP header directory.
+    list(APPEND include_dirs "${ROCM_ROOT}/include")
+    # In this example we also need to link nvrtc CUDA library
+    find_package("CUDAToolkit" REQUIRED)
+    list(APPEND link_libs "CUDA::nvrtc")
+endif()
+    
+target_link_libraries(${example_name} ${link_libs})
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
diff --git a/HIP-Basic/runtime_compilation/Makefile b/HIP-Basic/runtime_compilation/Makefile
new file mode 100644
index 00000000..65ba415d
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/Makefile
@@ -0,0 +1,55 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+EXAMPLE := hip_runtime_compilation
+COMMON_INCLUDE_DIR := ../../Common
+GPU_RUNTIME := HIP
+
+# HIP variables
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+
+# Common variables and flags
+CXX_STD  := c++17
+CXXFLAGS := -std=$(CXX_STD)
+CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+LDFLAGS	 :=
+LDLIBS	 :=
+
+ifeq ($(GPU_RUNTIME), CUDA)
+	CXXFLAGS += -x cu
+	CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
+	LDLIBS   += -l nvrtc
+else ifeq ($(GPU_RUNTIME), HIP)
+else
+	$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
+endif
+
+$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
+	$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
+
+clean:
+	$(RM) $(EXAMPLE)
+
+.PHONY: clean
diff --git a/HIP-Basic/runtime_compilation/README.md b/HIP-Basic/runtime_compilation/README.md
new file mode 100644
index 00000000..a1c7e39e
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/README.md
@@ -0,0 +1,91 @@
+# HIP-Basic Runtime Compilation Example
+
+## Description
+
+Runtime compilation allows compiling fragments of source code to machine code at runtime, when a program is already running, rather than compiling the code ahead of time. HIP supports runtime compilation through hipRTC, which can be used to compile HIP device code at runtime. This permits specific optimizations that depend on values determined at runtime. Therefore, usage of hipRTC provides the possibility of obtaining optimizations and performance improvements over offline compilation.
+
+This example showcases how to use hipRTC to compile a kernel at runtime and launch it on a device. This kernel is a simple SAXPY, i.e. a single-precision operation $y_i=ax_i+y_i$.
+
+### Application flow
+The diagram below summarizes the runtime compilation part of the example.
+1. A number of variables are declared and defined to configure the program which will be compiled at runtime.
+2. The program is created using the above variables as parameters, along with the SAXPY kernel in string form.
+3. The properties of the first device (GPU) available are consulted to set the device architecture as (the only) compile option.
+4. The program is compiled using the previously mentioned compile options.
+5. If a log was generated during the compile process, it is printed to the standard output.
+6. The binary compiled from the program is stored as a vector of characters and the program object is destroyed.
+7. Begin the preparation for the launch of the kernel on the device. A number of constants are defined to control the problem details and the kernel launch parameters.
+8. The two input vectors, $x$ and $y$, are instantiated in host memory and filled with the increasing sequences $1, 2, 3, 4, ...$ and $2, 4, 6, 8, ...$, respectively.
+9. The necessary amount of device (GPU) memory is allocated and the elements of the input vectors are copied to the device memory.
+10. A HIP module corresponding to the compiled binary is loaded into the current context and the SAXPY kernel is extracted from it into a HIP function object.
+11. The kernel launch configuration options and its arguments are declared and defined.
+12. A trace message is printed to the standard output.
+13. The GPU kernel is then launched with the above mentioned options along with the constants defined previously.
+14. The results are copied back to host vector $y$.
+15. The previously allocated device memory is freed.
+16. The module is unloaded from the current context and freed.
+17. The first few elements of the result vector $y$ are printed to the standard output.
+
+![hiprtc.svg](hiprtc.svg)
+## Key APIs and Concepts
+- `hipGetDeviceProperties` extracts the properties of the desired device. In this example it is used to get the GPU architecture.
+- `hipModuleGetFunction` extracts a handle for a function with a certain name from a given module. Note that if no function with that name is present in the module this method will return an error.
+- `hipModuleLaunchKernel` queues the launch of the provided kernel on the device. This function normally presents an asynchronous behaviour (see `HIP_LAUNCH_BLOCKING`), i.e. a call to it may return before the device finishes the execution of the kernel. Its parameters are the following:
+    - The kernel to be launched.
+    - Number of blocks in the dimension X of kernel grid, i.e. the X component of grid size.
+    - Number of blocks in the dimension Y of kernel grid, i.e. the Y component of grid size.
+    - Number of blocks in the dimension Z of kernel grid, i.e. the Z component of grid size.
+    - Number of threads in the dimension X of each block, i.e. the X component of block size.
+    - Number of threads in the dimension Y of each block, i.e. the Y component of block size.
+    - Number of threads in the dimension Z of each block, i.e. the Z component of block size.
+    - Amount of dynamic shared memory that will be available to each workgroup, in bytes. Not used in this example.
+    - The device stream, on which the kernel should be dispatched. If 0 (or NULL), the NULL stream will be used. In this example the latter is used.
+    - Pointer to the arguments needed by the kernel. Note that this parameter is not yet implemented, and thus the _extra_ parameter (the last one described in this list) should be used to pass arguments to the kernel.
+    - Pointer to all extra arguments passed to the kernel. They must be in the memory layout and alignment expected by the kernel. The list of arguments must end with `HIP_LAUNCH_PARAM_END`.
+- `hipModuleLoadData` builds a module from a code (compiled binary) object residing in host memory and loads it into the current context. Note that in this example this function is called right after `hipMalloc`. This is due to the fact that, on CUDA, `hipModuleLoadData` will fail if it is not called after some runtime API call is done (as it will implicitly initialize a current context) or if there is not an explicit creation of a (current) context.
+- `hipModuleUnload` unloads the specified module from the current context and frees it.
+- `hiprtcCompileProgram` compiles the given program at runtime. Some compilation options may be passed as parameters to this function. In this example, the GPU architecture is the only compilation option.
+- `hiprtcCreateProgram` instantiates a runtime compilation program from the given parameters. Those are the following:
+    - The runtime compilation program object that will be set with the new instance.
+    - A pointer to the program source code.
+    - A pointer to the program name.
+    - The number of headers to be included.
+    - An array of pointers to the sources (contents) of the headers.
+    - An array of pointers to the names by which the headers can be included in the source program.
+
+    In this example the program is created including two header files to illustrate how to pass all of the above arguments to this function.
+- `hiprtcDestroyProgram` destroys an instance of a given runtime compilation program object.
+- `hiprtcGetProgramLog` extracts the char pointer to the log generated during the compilation of a given runtime compilation program.
+- `hiprtcGetProgramLogSize` returns the compilation log size of a given runtime compilation program, measured as number of characters.
+- `hiprtcGetCode` extracts the char pointer to the compilation binary in memory from a runtime compilation program object. This binary is needed to load the corresponding HIP module into the current context and extract from it the kernel(s) that will be executed on the GPU.
+- `hiprtcGetCodeSize` returns the size of the compiled binary of a given runtime compilation program, measured as number of characters.
+
+## Demonstrated API Calls
+
+### HIP runtime
+
+#### Device symbols
+- `threadIdx`, `blockIdx`, `blockDim`
+
+#### Host symbols
+- `hipFree`
+- `hipGetDeviceProperties`
+- `hipGetLastError`
+- `hipMalloc`
+- `hipMemcpy`
+- `hipMemcpyDeviceToHost`
+- `hipMemcpyHostToDevice`
+- `hipModuleGetFunction`
+- `hipModuleLaunchKernel`
+- `hipModuleLoadData`
+- `hipModuleUnload`
+- `hiprtcCompileProgram`
+- `hiprtcCreateProgram`
+- `hiprtcDestroyProgram`
+- `hiprtcGetCode`
+- `hiprtcGetCodeSize`
+- `hiprtcGetProgramLog`
+- `hiprtcGetProgramLogSize`
+- `HIP_LAUNCH_PARAM_BUFFER_POINTER`
+- `HIP_LAUNCH_PARAM_BUFFER_SIZE`
+- `HIP_LAUNCH_PARAM_END`
diff --git a/HIP-Basic/runtime_compilation/hiprtc.svg b/HIP-Basic/runtime_compilation/hiprtc.svg
new file mode 100644
index 00000000..15aa28dc
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/hiprtc.svg
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1643px" height="362px" viewBox="-0.5 -0.5 1643 362"><defs/><g><rect x="1" y="1" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 21px; margin-left: 2px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">const char* src</div></div></div></foreignObject><text x="121" y="26" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">const char* src</text></switch></g><rect x="561" y="1" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 21px; margin-left: 562px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: 
normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hiprtcProgram</span></div></div></div></div></div></foreignObject><text x="681" y="26" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hiprtcProgram</text></switch></g><rect x="1121" y="1" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 21px; margin-left: 1122px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hiprtcProgram</span></div></div></div></div></div></foreignObject><text x="1241" y="26" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hiprtcProgram</text></switch></g><path d="M 301 1 L 501 1 L 521 21 L 501 41 L 301 41 L 281 21 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: 
unsafe center; width: 238px; height: 1px; padding-top: 21px; margin-left: 282px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hiprtcCreateProgram</span></div></div></div></div></div></foreignObject><text x="401" y="26" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hiprtcCreateProgram</text></switch></g><path d="M 521 21 L 552.76 21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 558.76 21 L 550.76 25 L 552.76 21 L 550.76 17 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 241 21 L 272.76 21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 278.76 21 L 270.76 25 L 272.76 21 L 270.76 17 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 861 1 L 1061 1 L 1081 21 L 1061 41 L 861 41 L 841 21 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 21px; margin-left: 842px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 
0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hiprtcCompileProgram</span></div></div></div></div></div></foreignObject><text x="961" y="26" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hiprtcCompileProgram</text></switch></g><path d="M 1081 21 L 1112.76 21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1118.76 21 L 1110.76 25 L 1112.76 21 L 1110.76 17 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 801 21 L 832.76 21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 838.76 21 L 830.76 25 L 832.76 21 L 830.76 17 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="1121" y="161" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 181px; margin-left: 1122px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">char* 
bin</div></div></div></foreignObject><text x="1241" y="186" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">char* bin</text></switch></g><path d="M 1141 81 L 1341 81 L 1361 101 L 1341 121 L 1141 121 L 1121 101 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 101px; margin-left: 1122px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hiprtcGetCode</span></div></div></div></div></div></foreignObject><text x="1241" y="106" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hiprtcGetCode</text></switch></g><path d="M 1241 41 L 1241 72.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1241 78.76 L 1237 70.76 L 1241 72.76 L 1245 70.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 1241 121 L 1241 152.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1241 158.76 L 1237 150.76 L 1241 152.76 L 1245 150.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" 
pointer-events="all"/><path d="M 1361 21 L 1392.76 21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1398.76 21 L 1390.76 25 L 1392.76 21 L 1390.76 17 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 1421 1 L 1621 1 L 1641 21 L 1621 41 L 1421 41 L 1401 21 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 21px; margin-left: 1402px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hiprtcDestroyProgram</span></div></div></div></div></div></foreignObject><text x="1521" y="26" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hiprtcDestroyProgram</text></switch></g><path d="M 861 161 L 1061 161 L 1081 181 L 1061 201 L 861 201 L 841 181 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: 
flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 181px; margin-left: 842px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hipModuleLoadData</span></div></div></div></div></div></foreignObject><text x="961" y="186" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hipModuleLoadData</text></switch></g><path d="M 1121 181 L 1089.24 181" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1083.24 181 L 1091.24 177 L 1089.24 181 L 1091.24 185 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="561" y="161" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 181px; margin-left: 562px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div 
style="font-size: 16px;"><span style="font-size: 16px;">hipModule_t</span></div></div></div></div></div></foreignObject><text x="681" y="186" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hipModule_t</text></switch></g><path d="M 841 181 L 809.24 181" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 803.24 181 L 811.24 177 L 809.24 181 L 811.24 185 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 581 241 L 781 241 L 801 261 L 781 281 L 581 281 L 561 261 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 261px; margin-left: 562px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hipModuleGetFunction</span></div></div></div></div></div></foreignObject><text x="681" y="266" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hipModuleGetFunction</text></switch></g><rect x="281" y="241" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; 
text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 261px; margin-left: 282px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hipFunction_t</span></div></div></div></div></div></foreignObject><text x="401" y="266" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hipFunction_t</text></switch></g><path d="M 301 321 L 501 321 L 521 341 L 501 361 L 301 361 L 281 341 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 341px; margin-left: 282px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 
16px;">hipModuleLaunchKernel</span></div></div></div></div></div></foreignObject><text x="401" y="346" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hipModuleLaunchKernel</text></switch></g><path d="M 401 281 L 401 312.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 401 318.76 L 397 310.76 L 401 312.76 L 405 310.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 301 161 L 501 161 L 521 181 L 501 201 L 301 201 L 281 181 Z" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 181px; margin-left: 282px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><div style="font-weight: normal; font-size: 16px; line-height: 16px;"><div style="font-size: 16px;"><span style="font-size: 16px;">hipModuleUnload</span></div></div></div></div></div></foreignObject><text x="401" y="186" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">hipModuleUnload</text></switch></g><path d="M 561 181 L 529.24 181" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 523.24 181 L 531.24 177 L 529.24 181 L 531.24 185 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" 
stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 561 261 L 529.24 261" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 523.24 261 L 531.24 257 L 529.24 261 L 531.24 265 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 681 201 L 681 232.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 681 238.76 L 677 230.76 L 681 232.76 L 685 230.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="841" y="241" width="240" height="40" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 261px; margin-left: 842px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 16px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">const char* kernel_name</div></div></div></foreignObject><text x="961" y="266" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="16px" text-anchor="middle">const char* kernel_name</text></switch></g><path d="M 841 261 L 809.24 261" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 803.24 261 L 811.24 257 L 809.24 261 L 811.24 265 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" 
pointer-events="all"/></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" xlink:href="https://www.diagrams.net/doc/faq/svg-export-text-problems" target="_blank"><text text-anchor="middle" font-size="10px" x="50%" y="100%">Text is not SVG - cannot display</text></a></switch></svg>
\ No newline at end of file
diff --git a/HIP-Basic/runtime_compilation/main.hip b/HIP-Basic/runtime_compilation/main.hip
new file mode 100644
index 00000000..f979e420
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/main.hip
@@ -0,0 +1,215 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "example_utils.hpp"
+
+#include <hip/hip_runtime.h>
+#include <hip/hiprtc.h>
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+// SAXPY kernel stored as a string; 'real' and 'realptr' are typedefs from the two included headers.
+static constexpr auto saxpy_kernel{
+    R"(
+#include "test_header.h"
+#include "test_header1.h"
+extern "C"
+__global__ void saxpy_kernel(const real a, const realptr d_x, realptr d_y, const unsigned int size)
+{
+    const unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if(global_idx < size)
+    {
+        d_y[global_idx] = a * d_x[global_idx] + d_y[global_idx];
+    }
+}
+)"};
+
+int main()
+{
+    // Program to be compiled at runtime.
+    hiprtcProgram prog;
+
+    // Vector containing example header names.
+    std::vector<const char*> header_names;
+    header_names.push_back("test_header.h");
+    header_names.push_back("test_header1.h");
+
+    // Vector containing the source of each example header to be included in the program.
+    std::vector<const char*> header_sources;
+    header_sources.push_back("#ifndef HIPRTC_TEST_HEADER_H\n#define HIPRTC_TEST_HEADER_H\ntypedef "
+                             "float real;\n#endif //HIPRTC_TEST_HEADER_H\n");
+    header_sources.push_back(
+        "#ifndef HIPRTC_TEST_HEADER1_H\n#define HIPRTC_TEST_HEADER1_H\ntypedef float* "
+        "realptr;\n#endif //HIPRTC_TEST_HEADER1_H\n");
+
+    // Create program from the kernel source, passing each header source together with its name.
+    hiprtcCreateProgram(&prog,
+                        saxpy_kernel,
+                        "saxpy_kernel.cu",
+                        header_sources.size(),
+                        header_sources.data(),
+                        header_names.data());
+
+    // Get device properties from the first device available.
+    hipDeviceProp_t        props;
+    constexpr unsigned int device_id = 0;
+    HIP_CHECK(hipGetDeviceProperties(&props, device_id));
+
+    // Obtain architecture's name from device properties and initialize array of compile options. When in CUDA we omit this option.
+    std::string sarg
+        = (props.gcnArchName[0]) ? std::string("--gpu-architecture=") + props.gcnArchName : "";
+    const char* options[]   = {sarg.c_str()};
+    const int   num_options = !sarg.empty();
+
+    // Compile program at runtime. Parameters are the program, number of options and array with options.
+    const hiprtcResult compile_result{hiprtcCompileProgram(prog, num_options, options)};
+
+    // Get the size of the log (possibly) generated during the compilation.
+    size_t log_size;
+    hiprtcGetProgramLogSize(prog, &log_size);
+
+    // If the compilation generated a log, print it.
+    if(log_size)
+    {
+        std::string log(log_size, '\0');
+        hiprtcGetProgramLog(prog, &log[0]);
+        std::cout << log << std::endl;
+    }
+
+    // If the compilation failed, say so and exit.
+    if(compile_result != HIPRTC_SUCCESS)
+    {
+        std::cout << "Error: compilation failed." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Get the size (in number of characters) of the binary compiled from the program.
+    size_t code_size;
+    hiprtcGetCodeSize(prog, &code_size);
+
+    // Store compiled binary as a vector of characters.
+    std::vector<char> code(code_size);
+    hiprtcGetCode(prog, code.data());
+
+    // Destroy program object; the compiled binary now lives in the 'code' vector.
+    hiprtcDestroyProgram(&prog);
+
+    // Now we launch the kernel on the device.
+
+    // Total number of float elements in each device vector.
+    constexpr unsigned int size = 4096;
+
+    // Total number of bytes to allocate for each device vector.
+    constexpr size_t size_bytes = size * sizeof(float);
+
+    // Number of threads per kernel block.
+    constexpr unsigned int block_size = 128;
+
+    // Number of blocks per kernel grid, calculated as ceil(size/block_size).
+    constexpr unsigned int grid_size = (size + block_size - 1) / block_size;
+
+    // Constant value 'a' to be used in the expression 'a*x+y'.
+    constexpr float a = 5.1f;
+
+    // Allocate x vector in host and fill it with increasing sequence 1, 2, 3, 4, ... .
+    std::vector<float> x(size);
+    std::iota(x.begin(), x.end(), 1.f);
+
+    // Allocate y vector in host and fill it with increasing sequence 2, 4, 6, 8, ... .
+    std::vector<float> y(x);
+    std::for_each(y.begin(), y.end(), [](float& f) { f = 2 * f; });
+
+    // Allocate vectors in device and copy from host to device memory.
+    float* d_x{};
+    float* d_y{};
+    HIP_CHECK(hipMalloc(&d_x, size_bytes));
+    HIP_CHECK(hipMalloc(&d_y, size_bytes));
+    HIP_CHECK(hipMemcpy(d_x, x.data(), size_bytes, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_y, y.data(), size_bytes, hipMemcpyHostToDevice));
+
+    // Load the HIP module corresponding to the compiled binary into the current context.
+    hipModule_t module;
+    HIP_CHECK(hipModuleLoadData(&module, code.data()));
+
+    // Extract SAXPY kernel from module into a function object.
+    hipFunction_t kernel;
+    HIP_CHECK(hipModuleGetFunction(&kernel, module, "saxpy_kernel"));
+
+    // Pack the kernel arguments (a, d_x, d_y, size) one after another into a byte buffer.
+    size_t offset    = 0;
+    char   args[256] = {};
+
+    *(reinterpret_cast<float*>(&args[offset])) = a;
+    offset += sizeof(a);
+    offset += 4; // padding so the next (pointer) argument is aligned; fix for CUDA executions
+    *(reinterpret_cast<float**>(&args[offset])) = d_x;
+    offset += sizeof(d_x);
+    *(reinterpret_cast<float**>(&args[offset])) = d_y;
+    offset += sizeof(d_y);
+    *(reinterpret_cast<unsigned int*>(&args[offset])) = size;
+    offset += sizeof(size);
+
+    // Create launch configuration pointing to the argument buffer and to its size in bytes.
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
+                      args,
+                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                      &offset,
+                      HIP_LAUNCH_PARAM_END};
+
+    std::cout << "Calculating y[i] = a * x[i] + y[i] over " << size << " elements." << std::endl;
+
+    // Launch the kernel on the NULL stream and with the above configuration.
+    HIP_CHECK(hipModuleLaunchKernel(kernel,
+                                    grid_size,
+                                    1,
+                                    1,
+                                    block_size,
+                                    1,
+                                    1,
+                                    0,
+                                    nullptr,
+                                    nullptr,
+                                    config));
+
+    // Check if the kernel launch was successful.
+    HIP_CHECK(hipGetLastError());
+
+    // Copy results from device to host.
+    HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost));
+
+    // Free device memory.
+    HIP_CHECK(hipFree(d_x));
+    HIP_CHECK(hipFree(d_y));
+
+    // Unload module.
+    HIP_CHECK(hipModuleUnload(module));
+
+    // Print the first few elements of the results for validation.
+    constexpr size_t elements_to_print = 10;
+    std::cout << "First " << elements_to_print << " elements of the results: "
+              << format_range(y.begin(), y.begin() + elements_to_print) << std::endl;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.sln b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.sln
new file mode 100644
index 00000000..584dd56d
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32630.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "runtime_compilation_vs2019", "runtime_compilation_vs2019.vcxproj", "{E03790B7-B203-4504-BEF5-F4F061183642}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.ActiveCfg = Debug|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.Build.0 = Debug|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.ActiveCfg = Release|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj
new file mode 100644
index 00000000..5e0168be
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{E03790B7-B203-4504-BEF5-F4F061183642}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>runtime_compilation_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>hiprtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>hiprtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj.filters b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj.filters
new file mode 100644
index 00000000..591e9f2c
--- /dev/null
+++ b/HIP-Basic/runtime_compilation/runtime_compilation_vs2019.vcxproj.filters
@@ -0,0 +1,27 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{2932a426-602b-4926-887e-27c50ba7eab7}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{ed043ec4-e8ac-4831-93f5-a58546ec7bea}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{0da954bd-e555-4454-b082-b68d10c753b9}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/HIP-Basic/saxpy/saxpy_vs2019.vcxproj b/HIP-Basic/saxpy/saxpy_vs2019.vcxproj
index 1844610a..d9602491 100644
--- a/HIP-Basic/saxpy/saxpy_vs2019.vcxproj
+++ b/HIP-Basic/saxpy/saxpy_vs2019.vcxproj
@@ -52,15 +52,17 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <TargetGPUArchitectures>gfx1030;gfx90c:xnack-</TargetGPUArchitectures>
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <TargetGPUArchitectures>gfx1030;gfx90c:xnack-</TargetGPUArchitectures>
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
@@ -94,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/HIP-Basic/shared_memory/.gitignore b/HIP-Basic/shared_memory/.gitignore
new file mode 100644
index 00000000..9c7163b7
--- /dev/null
+++ b/HIP-Basic/shared_memory/.gitignore
@@ -0,0 +1 @@
+hip_shared_memory
diff --git a/HIP-Basic/shared_memory/CMakeLists.txt b/HIP-Basic/shared_memory/CMakeLists.txt
new file mode 100644
index 00000000..49a91f20
--- /dev/null
+++ b/HIP-Basic/shared_memory/CMakeLists.txt
@@ -0,0 +1,59 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_shared_memory)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+set(GPU_RUNTIMES "HIP" "CUDA")
+set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES})
+
+if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES)
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+enable_language(${GPU_RUNTIME})
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH)
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+add_executable(${example_name} main.hip)
+# Make example runnable using ctest
+add_test(${example_name} ${example_name})
+
+set(include_dirs "../../Common")
+
+# For examples targeting NVIDIA, include the HIP header directory.
+if(GPU_RUNTIME STREQUAL "CUDA")
+    list(APPEND include_dirs "${ROCM_ROOT}/include")
+endif()
+
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME})
diff --git a/HIP-Basic/shared_memory/Makefile b/HIP-Basic/shared_memory/Makefile
new file mode 100644
index 00000000..36a7d271
--- /dev/null
+++ b/HIP-Basic/shared_memory/Makefile
@@ -0,0 +1,54 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+EXAMPLE := hip_shared_memory
+COMMON_INCLUDE_DIR := ../../Common
+GPU_RUNTIME := HIP
+
+# HIP variables
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+
+# Common variables and flags
+CXX_STD  := c++17
+CXXFLAGS := -std=$(CXX_STD)
+CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+LDFLAGS	 :=
+LDLIBS	 :=
+
+ifeq ($(GPU_RUNTIME), CUDA)
+	CXXFLAGS += -x cu
+	CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
+else ifeq ($(GPU_RUNTIME), HIP)
+else
+	$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
+endif
+
+$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
+	$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
+
+clean:
+	$(RM) $(EXAMPLE)
+
+.PHONY: clean
diff --git a/HIP-Basic/shared_memory/README.md b/HIP-Basic/shared_memory/README.md
new file mode 100644
index 00000000..9f05802f
--- /dev/null
+++ b/HIP-Basic/shared_memory/README.md
@@ -0,0 +1,47 @@
+# HIP-Basic Shared Memory Example
+
+## Description
+Shared memory is an on-chip type of memory that is visible to all the threads within the same block, allowing them to communicate by writing and reading data from the same memory space. However, some synchronization among the threads of the block is needed to ensure that all of them have finished writing before any of them tries to access the data.
+
+When using the appropriate access pattern, this memory can provide much lower latency than local or global memory (nearly as low as that of registers), making it a much better option in certain cases. If the size of the shared memory to be used is known at compile time, it can be explicitly specified and it is then known as static shared memory.
+
+This example implements a simple matrix transpose kernel to showcase how to use static shared memory.
+
+### Application flow 
+1. A number of constants are defined for the kernel launch parameters.
+2. The input and output matrices are allocated and initialized in host memory.
+3. The necessary amount of device memory for the input and output matrices is allocated and the input data is copied to the device.
+4. A trace message is printed to the standard output.
+5. The GPU kernel is then launched with the previously defined arguments. 
+6. The transposed matrix is copied back to host memory.
+7. All device memory is freed.
+8. The expected transposed matrix is calculated with a CPU version of the transpose kernel and the transposed matrix obtained from the kernel execution is then compared with it. The result of the comparison is printed to the standard output.
+
+## Key APIs and Concepts
+- `__shared__` is a variable declaration specifier necessary to allocate shared memory from the device. 
+- `__syncthreads` synchronizes all the threads within the same block. This synchronization barrier is used to ensure that every thread in a block has finished writing to shared memory before other threads in the block try to access that data.
+- `hipMalloc` allocates device memory in global memory, and with `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others.
+- `hipLaunchKernelGGL` queues the execution of a kernel on a device (GPU).
+- `hipGetLastError` gets the last error returned by any HIP runtime API call.
+- `hipFree` deallocates device memory allocated with `hipMalloc`.
+
+## Demonstrated API Calls
+
+### HIP runtime
+- `__global__`
+- `__shared__`
+
+#### Device symbols
+- `blockDim`
+- `blockIdx`
+- `threadIdx`
+- `__syncthreads`
+
+#### Host symbols
+- `hipFree`
+- `hipGetLastError`
+- `hipLaunchKernelGGL`
+- `hipMalloc`
+- `hipMemcpy`
+- `hipMemcpyDeviceToHost`
+- `hipMemcpyHostToDevice`
diff --git a/HIP-Basic/shared_memory/main.hip b/HIP-Basic/shared_memory/main.hip
new file mode 100644
index 00000000..9fa15f20
--- /dev/null
+++ b/HIP-Basic/shared_memory/main.hip
@@ -0,0 +1,160 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "example_utils.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+/// \brief Transposes the matrix \p in and stores the result in \p out using static shared memory.
+template<const unsigned int Width = 64>
+__global__ void matrix_transpose_kernel(float* out, const float* in)
+{
+    // Static shared memory (size known at compile time) holding Width x Width floats.
+    constexpr unsigned int size = Width * Width;
+    __shared__ float       shared_matrix_memory[size];
+
+    // Compute the row and column indexes of the matrix element that each thread is going
+    // to process.
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    // If not out of bounds, transpose element (x,y).
+    if(x < Width && y < Width)
+    {
+        // Store transposed element in shared memory.
+        shared_matrix_memory[y * Width + x] = in[x * Width + y];
+    }
+
+    // Synchronize threads so all writes are done before accessing shared memory again.
+    __syncthreads();
+
+    // If not out of bounds, write the already transposed element to the output matrix.
+    if(x < Width && y < Width)
+    {
+        // Copy transposed element from shared memory to global memory.
+        out[y * Width + x] = shared_matrix_memory[y * Width + x];
+    }
+}
+
+// CPU reference transpose of a width x width matrix: output[i * width + j] = input[j * width + i].
+std::vector<float> expected_matrix_transpose(const std::vector<float>& input,
+                                             const unsigned int        width)
+{
+    std::vector<float> output(width * width);
+    for(unsigned int j = 0; j < width; j++)
+    {
+        for(unsigned int i = 0; i < width; i++)
+        {
+            output[i * width + j] = input[j * width + i];
+        }
+    }
+    return output;
+}
+
+int main()
+{
+    // Number of rows and columns, total number of elements and size in bytes of the matrix
+    // to be transposed.
+    constexpr unsigned int width      = 64;
+    constexpr unsigned int size       = width * width;
+    constexpr unsigned int size_bytes = size * sizeof(float);
+
+    // Number of threads in each dimension of the kernel block.
+    constexpr unsigned int block_size = 4;
+
+    // Number of blocks in each dimension of the grid. Calculated as ceil(width/block_size).
+    constexpr unsigned int grid_size = (width + block_size - 1) / block_size;
+
+    // Block and grid sizes in 2D.
+    constexpr dim3 block_dim(block_size, block_size);
+    constexpr dim3 grid_dim(grid_size, grid_size);
+
+    // Allocate host input matrix and initialize with increasing sequence 10, 20, 30, ....
+    std::vector<float> matrix(size);
+    std::iota(matrix.begin(), matrix.end(), 1.f);
+    std::for_each(matrix.begin(), matrix.end(), [](float& f) { f = 10.f * f; });
+
+    // Allocate matrix to store the results of the kernel execution.
+    std::vector<float> transposed_matrix(size);
+
+    // Allocate input and output matrices on device.
+    float* d_matrix{};
+    float* d_transposed_matrix{};
+    HIP_CHECK(hipMalloc(&d_matrix, size_bytes));
+    HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes));
+
+    // Copy input matrix data from host to device.
+    HIP_CHECK(hipMemcpy(d_matrix, matrix.data(), size_bytes, hipMemcpyHostToDevice));
+
+    // Print trace message.
+    std::cout << "Computing matrix transpose." << std::endl;
+
+    // Launch kernel on the default stream without dynamic shared memory (the kernel's array is
+    // static). Kernel arguments are passed at the end of the hipLaunchKernelGGL function call.
+    hipLaunchKernelGGL(matrix_transpose_kernel<width>,
+                       grid_dim,
+                       block_dim,
+                       0,
+                       hipStreamDefault,
+                       d_transposed_matrix,
+                       d_matrix);
+
+    // Check if the kernel launch was successful.
+    HIP_CHECK(hipGetLastError());
+
+    // Copy results from device to host.
+    HIP_CHECK(hipMemcpy(transposed_matrix.data(),
+                        d_transposed_matrix,
+                        size_bytes,
+                        hipMemcpyDeviceToHost));
+
+    // Free device memory.
+    HIP_CHECK(hipFree(d_matrix));
+    HIP_CHECK(hipFree(d_transposed_matrix));
+
+    // Calculate expected transposed matrix with the CPU version of the kernel.
+    std::vector<float> expected_transposed_matrix = expected_matrix_transpose(matrix, width);
+
+    // Validate results: count the elements differing from the expected ones by more than eps.
+    unsigned int    errors = 0;
+    constexpr float eps    = 1.0E-6;
+    std::cout << "Validating transposed matrix." << std::endl;
+    for(unsigned int i = 0; i < size; i++)
+    {
+        errors += (std::fabs(transposed_matrix[i] - expected_transposed_matrix[i]) > eps);
+    }
+
+    if(errors)
+    {
+        std::cout << "Validation failed with " << errors << " errors." << std::endl;
+        return error_exit_code;
+    }
+    else
+    {
+        std::cout << "Validation passed." << std::endl;
+    }
+}
diff --git a/HIP-Basic/shared_memory/shared_memory_vs2019.sln b/HIP-Basic/shared_memory/shared_memory_vs2019.sln
new file mode 100644
index 00000000..f256b16c
--- /dev/null
+++ b/HIP-Basic/shared_memory/shared_memory_vs2019.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32630.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "shared_memory_vs2019", "shared_memory_vs2019.vcxproj", "{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.ActiveCfg = Debug|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.Build.0 = Debug|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.ActiveCfg = Release|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj
new file mode 100644
index 00000000..8f74a594
--- /dev/null
+++ b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>shared_memory_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj.filters b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj.filters
new file mode 100644
index 00000000..591e9f2c
--- /dev/null
+++ b/HIP-Basic/shared_memory/shared_memory_vs2019.vcxproj.filters
@@ -0,0 +1,27 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{2932a426-602b-4926-887e-27c50ba7eab7}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{ed043ec4-e8ac-4831-93f5-a58546ec7bea}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{0da954bd-e555-4454-b082-b68d10c753b9}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/HIP-Basic/streams/streams_vs2019.vcxproj b/HIP-Basic/streams/streams_vs2019.vcxproj
index 2b0e8932..50d5b2d3 100644
--- a/HIP-Basic/streams/streams_vs2019.vcxproj
+++ b/HIP-Basic/streams/streams_vs2019.vcxproj
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
@@ -67,6 +69,7 @@
       <WarningLevel>Level1</WarningLevel>
       <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -80,6 +83,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -92,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/HIP-Basic/warp_shuffle/.gitignore b/HIP-Basic/warp_shuffle/.gitignore
new file mode 100644
index 00000000..561ef15b
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/.gitignore
@@ -0,0 +1 @@
+hip_warp_shuffle
diff --git a/HIP-Basic/warp_shuffle/CMakeLists.txt b/HIP-Basic/warp_shuffle/CMakeLists.txt
new file mode 100644
index 00000000..f8f8b666
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/CMakeLists.txt
@@ -0,0 +1,58 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_warp_shuffle) # Shared by the project, the executable and the test.
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+set(GPU_RUNTIME "HIP" CACHE STRING "Switches between HIP and CUDA")
+set(GPU_RUNTIMES "HIP" "CUDA")
+set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) # Choices offered by CMake GUIs.
+
+if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) # Reject any other runtime at configure time.
+    set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP or CUDA.")
+    message(FATAL_ERROR ${ERROR_MESSAGE})
+endif()
+
+enable_language(${GPU_RUNTIME}) # The runtime name doubles as the CMake language name.
+set(CMAKE_${GPU_RUNTIME}_STANDARD 17)
+set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF)
+set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON)
+
+set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
+if(NOT CMAKE_PREFIX_PATH) # Keep a user-provided prefix path; otherwise search in ROCM_ROOT.
+    set(CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+endif()
+
+add_executable(${example_name} main.hip)
+# Make example runnable using ctest.
+add_test(${example_name} ${example_name})
+
+set(include_dirs "../../Common")
+# For examples targeting NVIDIA, include the HIP header directory.
+if(GPU_RUNTIME STREQUAL "CUDA")
+    list(APPEND include_dirs "${ROCM_ROOT}/include")
+endif()
+
+target_include_directories(${example_name} PRIVATE ${include_dirs})
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${GPU_RUNTIME}) # Compile main.hip as HIP or CUDA.
diff --git a/HIP-Basic/warp_shuffle/Makefile b/HIP-Basic/warp_shuffle/Makefile
new file mode 100644
index 00000000..1143e9c4
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/Makefile
@@ -0,0 +1,54 @@
+# MIT License
+#
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+EXAMPLE := hip_warp_shuffle
+COMMON_INCLUDE_DIR := ../../Common
+GPU_RUNTIME := HIP
+
+# HIP variables: ROCm installation prefix and the HIP headers below it.
+ROCM_INSTALL_DIR := /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+
+# Common variables and flags: build as C++17 with the shared example headers on the include path.
+CXX_STD  := c++17
+CXXFLAGS := -std=$(CXX_STD)
+CPPFLAGS := -I $(COMMON_INCLUDE_DIR)
+LDFLAGS	 :=
+LDLIBS	 :=
+
+ifeq ($(GPU_RUNTIME), CUDA) # Treat main.hip as CUDA source and add the HIP headers.
+	CXXFLAGS += -x cu
+	CPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
+else ifeq ($(GPU_RUNTIME), HIP) # HIP needs no extra flags.
+else
+	$(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
+endif
+
+$(EXAMPLE): main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
+	$(HIPCXX) $< $(CXXFLAGS) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) -o $@
+
+clean:
+	$(RM) $(EXAMPLE)
+
+.PHONY: clean
diff --git a/HIP-Basic/warp_shuffle/README.md b/HIP-Basic/warp_shuffle/README.md
new file mode 100644
index 00000000..691857d3
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/README.md
@@ -0,0 +1,53 @@
+# HIP-Basic Warp Shuffle Example
+
+## Description
+Kernel code for a particular block is executed in groups of threads known as _wavefronts_ (AMD) or _warps_ (NVIDIA). Each block is divided into as many warps as the block's size allows. If the block size is less than the warp size, then part of the warp just stays idle (as happens in this example). AMD GPUs use 64 threads per wavefront for architectures prior to RDNA™ 1. RDNA architectures support both 32 and 64 wavefront sizes.
+
+Warps are executed in _lockstep_, i.e. all the threads in each warp execute the same instruction at the same time but with different data. This type of parallel processing is also known as Single Instruction, Multiple Data (SIMD). A block contains several warps and the warp size is dependent on the architecture, but the block size is not. Blocks and warps also differ in the way they are executed, and thus they may provide different results when used in the same piece of code. For instance, the kernel code of this example would not work as-is with block-level execution and shared memory access, because some synchronization would be needed to ensure that every thread has written its corresponding value before another thread tries to read it.
+
+Higher performance in the execution of kernels can be achieved with explicit warp-level programming. This can be done by using some collective operations, known as _warp shuffles_, that allow exchanging data between threads in the same warp without the need for shared memory. This exchange occurs simultaneously for all the active threads in the warp.
+
+This example showcases how to use the above-mentioned operations by implementing a simple matrix transpose kernel.
+
+### Application flow
+1. A number of constants are defined for the kernel launch parameters.
+2. The input and output matrices are allocated and initialized in host memory.
+3. The necessary amount of device memory for the input and output matrices is allocated and the input data is copied to the device.
+4. A trace message is printed to the standard output.
+5. The GPU kernel is then launched with the previously defined arguments.
+6. The transposed matrix is copied back to host memory.
+7. All device memory is freed.
+8. The expected transposed matrix is calculated with a CPU version of the transpose kernel and the transposed matrix obtained from the kernel execution is then compared with it. The result of the comparison is printed to the standard output.
+
+## Key APIs and Concepts
+Warp shuffle is a warp-level primitive that allows for the communication between the threads of a warp. Below is a simple example that shows how the value of the thread with index 2 is copied to all other threads within the warp.
+![warp_shuffle_simple.svg](warp_shuffle_simple.svg)
+
+`__shfl(var, src_lane, width = warp_size)` copies the value of `var` from the thread `src_lane` within the warp. This operation admits a third parameter (not used in this example), `width`, which defaults to the warp size and allows restricting the number of threads of the warp from which values are read. Values are copied from threads with an ID in the range $[0, width-1]$. If the ID of the thread specified in the call to `__shfl` is out of that range, then the thread accessed is the one with that ID modulo `width`. The `src_lane` may also vary per thread, as shown below.
+
+![warp_shuffle.svg](warp_shuffle.svg)
+
+- `hipGetDeviceProperties` gets the properties of the specified device. In this example, it is used to get the warp size of the device (GPU) used.
+- `hipMalloc` allocates memory in the global memory of the device, and with `hipMemcpy` data bytes can be transferred from host to device (using `hipMemcpyHostToDevice`) or from device to host (using `hipMemcpyDeviceToHost`), among others. `hipLaunchKernelGGL` queues the execution of a kernel on a device (GPU).
+- `hipGetLastError` gets the last error returned by any HIP runtime API call.
+- `hipFree` deallocates device memory allocated with `hipMalloc`.
+
+## Demonstrated API Calls
+
+### HIP runtime
+
+#### Device symbols
+- `__global__`
+- `threadIdx`
+- `__shfl`
+
+#### Host symbols
+- `hipFree`
+- `hipGetDeviceProperties`
+- `hipGetLastError`
+- `hipLaunchKernelGGL`
+- `hipMalloc`
+- `hipMemcpy`
+- `hipMemcpyDeviceToHost`
+- `hipMemcpyHostToDevice`
+- `hipStreamDefault`
diff --git a/HIP-Basic/warp_shuffle/main.hip b/HIP-Basic/warp_shuffle/main.hip
new file mode 100644
index 00000000..1174d924
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/main.hip
@@ -0,0 +1,156 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "example_utils.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+/// \brief Transposes the matrix \p in and stores the result in \p out using warp shuffle operations.
+__global__ void matrix_transpose_kernel(float* out, const float* in, const unsigned int width)
+{
+    // Column and row of the element this thread owns. Since in this example there is only
+    // one block, they are precisely the thread's ID in each dimension.
+    const unsigned int x = threadIdx.x;
+    const unsigned int y = threadIdx.y;
+
+    // If not out of bounds, transpose element.
+    if(x < width && y < width)
+    {
+        // This example keeps the matrix no larger than a warp, so the linear ID of a thread
+        // (x varies fastest) is also its lane ID. mirror_lane is the lane at position (y, x).
+        const unsigned int lane        = y * width + x;
+        const unsigned int mirror_lane = x * width + y;
+
+        // Each lane loads the element at its own position. Its transposed value is the element
+        // loaded by the mirrored lane, so fetch that one with a shuffle operation (__shfl).
+        // __shfl does not require all threads to be active, so it can be inside the if block.
+        const float val = in[lane];
+        out[lane]       = __shfl(val, mirror_lane);
+    }
+}
+
+/// \brief Host reference: returns the transpose of the flattened \p width x \p width matrix \p input.
+std::vector<float> expected_matrix_transpose(const std::vector<float>& input,
+                                             const unsigned int        width)
+{
+    std::vector<float> transposed(width * width);
+    // Walk the input linearly; the element at (row, col) lands at (col, row) in the result.
+    for(unsigned int idx = 0; idx < width * width; ++idx)
+    {
+        const unsigned int row = idx / width;
+        const unsigned int col = idx % width;
+        transposed[col * width + row] = input[idx];
+    }
+    return transposed;
+}
+
+/// \brief Transposes a small matrix on the GPU with warp shuffles and validates it on the host.
+int main()
+{
+    // Number of rows/columns, total number of elements and size in bytes of the matrix.
+    constexpr unsigned int width      = 4;
+    constexpr unsigned int size       = width * width;
+    constexpr unsigned int size_bytes = size * sizeof(float);
+
+    // Get device's warp size.
+    hipDeviceProp_t props;
+    HIP_CHECK(hipGetDeviceProperties(&props, 0 /*device ID*/));
+
+    // The kernel exchanges data within a single warp, so the whole matrix must fit in one.
+    // This is a runtime check (not an assert) so that it is also enforced in release builds.
+    if(static_cast<int>(size) > props.warpSize)
+    {
+        std::cerr << "Matrix has more elements than architecture's warp size value." << std::endl;
+        return error_exit_code;
+    }
+
+    // Block (2D) and grid sizes. Note that in this example we have only 1 block (and 1 warp).
+    constexpr dim3 block_dim(width, width);
+    constexpr dim3 grid_dim(1);
+
+    // Allocate host input matrix and initialize with increasing sequence 10, 20, 30, ....
+    std::vector<float> matrix(size);
+    std::iota(matrix.begin(), matrix.end(), 1.f);
+    std::for_each(matrix.begin(), matrix.end(), [](float& f) { f = 10.f * f; });
+
+    // Allocate matrix to store the results of the kernel execution.
+    std::vector<float> transposed_matrix(size);
+
+    // Allocate input and output matrices on device.
+    float* d_matrix{};
+    float* d_transposed_matrix{};
+    HIP_CHECK(hipMalloc(&d_matrix, size_bytes));
+    HIP_CHECK(hipMalloc(&d_transposed_matrix, size_bytes));
+
+    // Copy input matrix data from host to device.
+    HIP_CHECK(hipMemcpy(d_matrix, matrix.data(), size_bytes, hipMemcpyHostToDevice));
+
+    // Print trace message.
+    std::cout << "Computing matrix transpose." << std::endl;
+
+    // Launch the kernel on the default stream with a single block.
+    hipLaunchKernelGGL(matrix_transpose_kernel,
+                       grid_dim,
+                       block_dim,
+                       0,
+                       hipStreamDefault,
+                       d_transposed_matrix,
+                       d_matrix,
+                       width);
+
+    // Check if the kernel launch was successful.
+    HIP_CHECK(hipGetLastError());
+
+    // Copy results from device to host.
+    HIP_CHECK(hipMemcpy(transposed_matrix.data(),
+                        d_transposed_matrix,
+                        size_bytes,
+                        hipMemcpyDeviceToHost));
+
+    // Free device memory.
+    HIP_CHECK(hipFree(d_matrix));
+    HIP_CHECK(hipFree(d_transposed_matrix));
+
+    // Calculate expected transposed matrix with the CPU version of the kernel.
+    std::vector<float> expected_transposed_matrix = expected_matrix_transpose(matrix, width);
+
+    // Validate results comparing with expected transposed matrix.
+    unsigned int    errors = 0;
+    constexpr float eps    = 1.0E-6;
+    std::cout << "Validating transposed matrix." << std::endl;
+    for(unsigned int i = 0; i < size; i++)
+    {
+        errors += (std::fabs(transposed_matrix[i] - expected_transposed_matrix[i]) > eps);
+    }
+
+    if(errors)
+    {
+        std::cout << "Validation failed with " << errors << " errors." << std::endl;
+        return error_exit_code;
+    }
+    std::cout << "Validation passed." << std::endl;
+}
diff --git a/HIP-Basic/warp_shuffle/warp_shuffle.svg b/HIP-Basic/warp_shuffle/warp_shuffle.svg
new file mode 100644
index 00000000..d493ffcc
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/warp_shuffle.svg
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1161px" height="302px" viewBox="-0.5 -0.5 1161 302"><defs/><g><rect x="1000" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 1001px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">xn</div></div></div></foreignObject><text x="1040" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">xn</text></switch></g><rect x="610" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 611px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: 
normal;">0</div></div></div></foreignObject><text x="640" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">0</text></switch></g><rect x="690" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 691px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">1</div></div></div></foreignObject><text x="720" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">1</text></switch></g><rect x="770" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 771px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">2</div></div></div></foreignObject><text x="800" y="28" 
fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">2</text></switch></g><rect x="850" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 851px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">3</div></div></div></foreignObject><text x="880" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">3</text></switch></g><rect x="920" y="0" width="240" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 20px; margin-left: 921px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">warp_size - 1</div></div></div></foreignObject><text x="1040" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" 
font-size="28px" text-anchor="middle">warp_size - 1</text></switch></g><rect x="500" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 501px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">lane</div></div></div></foreignObject><text x="530" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">lane</text></switch></g><rect x="0" y="160" width="560" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 558px; height: 1px; padding-top: 180px; margin-left: 1px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">ret = __shfl(var, thread_idx % 2)</div></div></div></foreignObject><text x="280" y="188" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" 
text-anchor="middle">ret = __shfl(var, thread_idx % 2)</text></switch></g><rect x="500" y="80" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 100px; margin-left: 501px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">var</div></div></div></foreignObject><text x="530" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">var</text></switch></g><rect x="840" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 841px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x1</div></div></div></foreignObject><text x="880" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" 
text-anchor="middle">x1</text></switch></g><rect x="1000" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 1001px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x1</div></div></div></foreignObject><text x="1040" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x1</text></switch></g><path d="M 640 140 L 640 209.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 640 217.76 L 636 209.76 L 644 209.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 720 140 L 720 209.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 720 217.76 L 716 209.76 L 724 209.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 640 140 L 790.84 215.42" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 798 219 L 789.06 219 L 792.63 211.84 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 720 140 L 870.84 215.42" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" 
stroke-miterlimit="10" pointer-events="stroke"/><path d="M 878 219 L 869.06 219 L 872.63 211.84 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 720 140 L 1030.07 217.52" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1037.83 219.46 L 1029.1 221.4 L 1031.04 213.64 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="500" y="240" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 260px; margin-left: 501px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">ret</div></div></div></foreignObject><text x="530" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">ret</text></switch></g><rect x="600" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 
601px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x0</div></div></div></foreignObject><text x="640" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x0</text></switch></g><rect x="600" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 601px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x0</div></div></div></foreignObject><text x="640" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x0</text></switch></g><rect x="680" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 681px;"><div 
style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x1</div></div></div></foreignObject><text x="720" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x1</text></switch></g><rect x="680" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 681px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x1</div></div></div></foreignObject><text x="720" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x1</text></switch></g><rect x="760" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 761px;"><div 
style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x0</div></div></div></foreignObject><text x="800" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x0</text></switch></g><rect x="760" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 761px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="800" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x2</text></switch></g><rect x="840" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 841px;"><div 
style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x3</div></div></div></foreignObject><text x="880" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x3</text></switch></g><rect x="930" y="240" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 260px; margin-left: 931px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">...</div></div></div></foreignObject><text x="960" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">...</text></switch></g><rect x="930" y="80" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 100px; margin-left: 931px;"><div style="box-sizing: border-box; font-size: 0px; text-align: 
center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">...</div></div></div></foreignObject><text x="960" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">...</text></switch></g></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" xlink:href="https://www.diagrams.net/doc/faq/svg-export-text-problems" target="_blank"><text text-anchor="middle" font-size="10px" x="50%" y="100%">Text is not SVG - cannot display</text></a></switch></svg>
\ No newline at end of file
diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_simple.svg b/HIP-Basic/warp_shuffle/warp_shuffle_simple.svg
new file mode 100644
index 00000000..87183fad
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/warp_shuffle_simple.svg
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1161px" height="302px" viewBox="-0.5 -0.5 1161 302"><defs/><g><rect x="1000" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 1001px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">xn</div></div></div></foreignObject><text x="1040" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">xn</text></switch></g><rect x="610" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 611px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: 
normal;">0</div></div></div></foreignObject><text x="640" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">0</text></switch></g><rect x="690" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 691px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">1</div></div></div></foreignObject><text x="720" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">1</text></switch></g><rect x="770" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 771px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">2</div></div></div></foreignObject><text x="800" y="28" 
fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">2</text></switch></g><rect x="850" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 851px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">3</div></div></div></foreignObject><text x="880" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">3</text></switch></g><rect x="920" y="0" width="240" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 238px; height: 1px; padding-top: 20px; margin-left: 921px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">warp_size - 1</div></div></div></foreignObject><text x="1040" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" 
font-size="28px" text-anchor="middle">warp_size - 1</text></switch></g><rect x="500" y="0" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 20px; margin-left: 501px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">lane</div></div></div></foreignObject><text x="530" y="28" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">lane</text></switch></g><rect x="0" y="160" width="560" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe flex-end; width: 558px; height: 1px; padding-top: 180px; margin-left: 0px;"><div style="box-sizing: border-box; font-size: 0px; text-align: right;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">ret = __shfl(var, 2)</div></div></div></foreignObject><text x="558" y="188" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" 
text-anchor="end">ret = __shfl(var, 2)</text></switch></g><rect x="500" y="80" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 100px; margin-left: 501px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">var</div></div></div></foreignObject><text x="530" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">var</text></switch></g><rect x="840" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 841px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="880" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" 
text-anchor="middle">x2</text></switch></g><rect x="1000" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 1001px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="1040" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x2</text></switch></g><path d="M 800 140 L 649.16 215.42" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 642 219 L 647.37 211.84 L 650.94 219 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 140 L 727.24 212.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 721.58 218.42 L 724.41 209.93 L 730.07 215.59 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 140 L 800 209.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 800 217.76 L 796 209.76 L 804 209.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 140 L 872.76 212.76" fill="none" stroke="rgb(0, 0, 0)" 
stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 878.42 218.42 L 869.93 215.59 L 875.59 209.93 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 140 L 1030.29 216.76" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 1037.88 219.29 L 1029.02 220.56 L 1031.55 212.97 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="500" y="240" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 260px; margin-left: 501px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">ret</div></div></div></foreignObject><text x="530" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">ret</text></switch></g><rect x="600" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; 
padding-top: 260px; margin-left: 601px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="640" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x2</text></switch></g><rect x="600" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 601px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x0</div></div></div></foreignObject><text x="640" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x0</text></switch></g><rect x="680" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 
100px; margin-left: 681px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x1</div></div></div></foreignObject><text x="720" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x1</text></switch></g><rect x="680" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; margin-left: 681px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="720" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x2</text></switch></g><rect x="760" y="220" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 260px; 
margin-left: 761px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="800" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x2</text></switch></g><rect x="760" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 761px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x2</div></div></div></foreignObject><text x="800" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x2</text></switch></g><rect x="840" y="60" width="80" height="80" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 78px; height: 1px; padding-top: 100px; margin-left: 
841px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">x3</div></div></div></foreignObject><text x="880" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">x3</text></switch></g><rect x="930" y="240" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 260px; margin-left: 931px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">...</div></div></div></foreignObject><text x="960" y="268" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">...</text></switch></g><rect x="930" y="80" width="60" height="40" fill="none" stroke="none" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 100px; margin-left: 931px;"><div style="box-sizing: border-box; font-size: 0px; 
text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 28px; font-family: Courier New; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">...</div></div></div></foreignObject><text x="960" y="108" fill="rgb(0, 0, 0)" font-family="Courier New" font-size="28px" text-anchor="middle">...</text></switch></g></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" xlink:href="https://www.diagrams.net/doc/faq/svg-export-text-problems" target="_blank"><text text-anchor="middle" font-size="10px" x="50%" y="100%">Text is not SVG - cannot display</text></a></switch></svg>
\ No newline at end of file
diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.sln b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.sln
new file mode 100644
index 00000000..164f344b
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.sln
@@ -0,0 +1,25 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32630.194
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_shuffle_vs2019", "warp_shuffle_vs2019.vcxproj", "{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.ActiveCfg = Debug|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.Build.0 = Debug|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.ActiveCfg = Release|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {D7C4B290-7C93-4D26-85D9-364F6A448EE0}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj
new file mode 100644
index 00000000..c56ab539
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>warp_shuffle_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj.filters b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj.filters
new file mode 100644
index 00000000..591e9f2c
--- /dev/null
+++ b/HIP-Basic/warp_shuffle/warp_shuffle_vs2019.vcxproj.filters
@@ -0,0 +1,27 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{2932a426-602b-4926-887e-27c50ba7eab7}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx;hip;cu</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{ed043ec4-e8ac-4831-93f5-a58546ec7bea}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd;cuh</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{0da954bd-e555-4454-b082-b68d10c753b9}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\Common\example_utils.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj b/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj
index dbe7b184..25cd7210 100644
--- a/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj
+++ b/Libraries/exampleLibraryTemplate/example_template/example_template_vs2019.vcxproj
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>example_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>example_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
@@ -94,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj b/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj
index 72d44d3b..d3a64fb7 100644
--- a/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj
+++ b/Libraries/rocPRIM/block_sum/block_sum_vs2019.vcxproj
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>rocprim_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>rocprim_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
@@ -94,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj b/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj
index 3a4abace..584b7d7c 100644
--- a/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj
+++ b/Libraries/rocPRIM/device_sum/device_sum_vs2019.vcxproj
@@ -20,7 +20,7 @@
     <VCProjectVersion>15.0</VCProjectVersion>
     <ProjectGuid>{E71DB5FB-A1C4-4BB4-8B46-0037C32C885E}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
-    <RootNamespace>example_template_vs2019</RootNamespace>
+    <RootNamespace>device_sum_vs2019</RootNamespace>
     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>rocprim_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>rocprim_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
@@ -94,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Libraries/rocRAND/simple_distributions_cpp/Makefile b/Libraries/rocRAND/simple_distributions_cpp/Makefile
index 3956c5f6..f573aea4 100644
--- a/Libraries/rocRAND/simple_distributions_cpp/Makefile
+++ b/Libraries/rocRAND/simple_distributions_cpp/Makefile
@@ -26,7 +26,7 @@ CUDACXX = $(CUDA_INSTALL_DIR)/bin/nvcc
 CXX_STD = c++17
 COMMON_INCLUDE_DIR = ../../../Common
 
-rocrand_simple_distributions_cpp: main.cpp argument_parsing.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
+rocrand_simple_distributions_cpp: main.cpp $(COMMON_INCLUDE_DIR)/cmdparser.hpp $(COMMON_INCLUDE_DIR)/example_utils.hpp
 ifeq ($(GPU_RUNTIME), CUDA)
 	$(CUDACXX) $< -std=$(CXX_STD) -isystem $(ROCM_INSTALL_DIR)/include -isystem $(CUDA_INSTALL_DIR)/include -I $(COMMON_INCLUDE_DIR) -L $(ROCM_INSTALL_DIR)/lib -L $(CUDA_INSTALL_DIR)/lib64 -lrocrand -lcudart -o $@ -D__HIP_PLATFORM_NVIDIA__ -x cu
 else
diff --git a/Libraries/rocRAND/simple_distributions_cpp/argument_parsing.hpp b/Libraries/rocRAND/simple_distributions_cpp/argument_parsing.hpp
deleted file mode 100644
index 4cb0f65a..00000000
--- a/Libraries/rocRAND/simple_distributions_cpp/argument_parsing.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-// MIT License
-//
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#ifndef SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP
-#define SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP
-
-#include <algorithm>
-#include <charconv>
-#include <iostream>
-#include <optional>
-#include <string_view>
-
-// Needed for the 's' suffix of `std::string` literals.
-using namespace std::string_literals;
-
-/// \brief The random distribution kind selected on the command line.
-enum class Distribution
-{
-    uniform_int,
-    uniform_real,
-    normal,
-    poisson
-};
-
-/// \brief The set of arguments parsed from the command line.
-struct CliArguments
-{
-    int          device_id_;
-    Distribution distribution_;
-    size_t       size_;
-    bool         print_;
-};
-
-/// \brief Operator overload to simply print a \p CliArguments instance.
-std::ostream& operator<<(std::ostream& os, const CliArguments& cli_args)
-{
-    // An immediately-invoked lambda expression selects the name of the distribution.
-    const std::string_view distribution_name = [&]()
-    {
-        switch(cli_args.distribution_)
-        {
-            case Distribution::uniform_int: return "uniform_int";
-            case Distribution::uniform_real: return "uniform_real";
-            case Distribution::normal: return "normal";
-            case Distribution::poisson: return "poisson";
-            default: return "unknown";
-        }
-    }();
-
-    // Printing the fields to the `std::ostream` object.
-    return os << "Selected device id: " << cli_args.device_id_
-              << "\nSelected distribution: " << distribution_name
-              << "\nSelected size: " << cli_args.size_ << "\nPrinting results: " << std::boolalpha
-              << cli_args.print_;
-}
-
-/// \brief Converts a \p std::string_view to integral type \p T.
-// Throws an exception with an error message if the conversion is unsuccessful.
-template<typename T>
-T parse_integral_arg(const std::string_view arg_value)
-{
-    T value;
-    // Try to convert the string_view to an integral type. If successful, the value is written to
-    // the variable `value`
-    const auto conversion_result
-        = std::from_chars(arg_value.data(), arg_value.data() + arg_value.size(), value);
-    // The default constructed `std::errc` stands for successful conversion.
-    if(conversion_result.ec != std::errc{})
-    {
-        throw std::runtime_error(
-            "Could not convert argument \""s.append(arg_value).append("\" to an integral value"));
-    }
-    return value;
-}
-
-/// \brief Parses an \p std::string_view to a \p Distribution.
-/// Throws an exception with an error message if the conversion is unsuccessful.
-Distribution parse_distribution_arg(const std::string_view distribution_arg)
-{
-    if(distribution_arg == "uniform_int")
-    {
-        return Distribution::uniform_int;
-    }
-    if(distribution_arg == "uniform_real")
-    {
-        return Distribution::uniform_real;
-    }
-    if(distribution_arg == "normal")
-    {
-        return Distribution::normal;
-    }
-    if(distribution_arg == "poisson")
-    {
-        return Distribution::poisson;
-    }
-    throw std::runtime_error(
-        "Argument \""s.append(distribution_arg).append("\" is not a valid distribution"));
-}
-
-/// \brief Parses the array of command line arguments to parameters consumed by the rest
-/// of the program. \p argc must be set to the size of the \p argv array. Each pointer in
-/// the \p argv array must point to a valid null-terminated string containing the argument.
-CliArguments parse_args(const int argc, const char** argv)
-{
-    // Pointers fulfill the random access iterator traits, thereby can be used with the
-    // standard algorithms.
-    const char** argv_end = argv + argc;
-
-    // This local function searches for `arg_name` in the argument array and returns true if found.
-    const auto find_argument = [&](const std::string_view arg_name)
-    {
-        const auto arg_name_it = std::find(argv, argv_end, arg_name);
-        return arg_name_it != argv_end;
-    };
-
-    // This local function searches for `arg_name` in the argument array. If found, it returns a pointer
-    // to the next argument -- that is assumed to be the provided value. Otherwise returns a null optional.
-    // If the found argument is the last one, an exception with an error message is thrown.
-    const auto find_argument_value
-        = [&](const std::string_view arg_name) -> std::optional<std::string_view>
-    {
-        const auto arg_name_it = std::find(argv, argv_end, arg_name);
-        if(arg_name_it == argv_end)
-        {
-            return std::nullopt;
-        }
-        // std::next returns the iterator copied and advanced by one
-        const auto arg_value_it = std::next(arg_name_it);
-        if(arg_value_it == argv_end)
-        {
-            throw std::runtime_error("Value for argument is not supplied: "s.append(arg_name));
-        }
-        return std::make_optional(*arg_value_it);
-    };
-
-    // The options below need provided values, thereby `find_argument_value` is used.
-    const auto device_arg       = find_argument_value("--device").value_or("0");
-    const auto distribution_arg = find_argument_value("--distribution").value_or("uniform_int");
-    const auto size_arg         = find_argument_value("--size").value_or("10000000");
-
-    // The option below is just a flag. Its existence is checked by `find_argument`.
-    const bool print_arg = find_argument("--print");
-
-    // Parse the arguments read to the corresponding type and return.
-    return {parse_integral_arg<int>(device_arg),
-            parse_distribution_arg(distribution_arg),
-            parse_integral_arg<size_t>(size_arg),
-            print_arg};
-}
-
-constexpr std::string_view cli_usage_message
-    = "Usage: simple_distributions_cpp [--device <device_id>] [--distribution "
-      "{uniform_int|uniform_real|normal|poisson}] [--size <size>] [--print]";
-
-#endif // SIMPLE_DISTRIBUTIONS_CPP_ARGUMENT_PARSING_HPP
diff --git a/Libraries/rocRAND/simple_distributions_cpp/main.cpp b/Libraries/rocRAND/simple_distributions_cpp/main.cpp
index 1f370ec8..0864c765 100644
--- a/Libraries/rocRAND/simple_distributions_cpp/main.cpp
+++ b/Libraries/rocRAND/simple_distributions_cpp/main.cpp
@@ -20,21 +20,22 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
-#include <chrono>
-#include <iostream>
-#include <random>
-#include <string_view>
-#include <vector>
+#include "cmdparser.hpp"
+#include "example_utils.hpp"
 
 #include <hip/hip_runtime.h>
+
 // Workaround for ROCm on Windows not including `__half` definitions, in a host compiler.
 #if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP__) && (defined(WIN32) || defined(_WIN32))
     #include <hip/amd_detail/hip_fp16_gcc.h>
 #endif
 #include <rocrand/rocrand.hpp>
 
-#include "argument_parsing.hpp"
-#include "example_utils.hpp"
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <string_view>
+#include <vector>
 
 // An anonymous namespace sets static linkage to its contents.
 // This means that the contained function definitions will only be visible
@@ -42,6 +43,15 @@
 namespace
 {
 
+/// \brief The random distribution kind selected on the command line.
+enum class Distribution
+{
+    uniform_int,
+    uniform_real,
+    normal,
+    poisson
+};
+
 /// \brief Selects the device (GPU) with the provided ID. If it cannot be selected
 /// (e.g. a non-existent device ID is passed), an exception is thrown.
 /// Otherwise, the name of the device is queried and printed to the standard output.
@@ -184,32 +194,90 @@ void dispatch_distribution_type(const Distribution dist, const size_t size, cons
     }
 }
 
+/// \brief Registers this example's optional arguments and their defaults on \p parser.
+void configure_parser(cli::Parser& parser)
+{
+    // Device id, defaults to device 0.
+    parser.set_optional<int>("device", "device", 0, "Device Id");
+    // rocRAND distribution name, defaults to "uniform_int".
+    parser.set_optional<std::string>("distribution",
+                                     "distribution",
+                                     "uniform_int",
+                                     "rocRAND distribution");
+    // Problem size, defaults to 10000000.
+    parser.set_optional<size_t>("size", "size", 10000000, "Problem size");
+    parser.set_optional<bool>(
+        "print",
+        "print",
+        false, // Printing is off unless the flag is given.
+        "Toggle printing on or off. This is a boolean argument and takes no value. If it is "
+        "provided the value is set to \"on\"");
+}
+
+/// \brief Converts the distribution name passed on the command line to the matching
+/// \p Distribution value. If the name is not recognized, an error message is printed to
+/// the standard error and the program exits with \p error_exit_code.
+Distribution get_distribution(const std::string& distribution_arg)
+{
+    if(distribution_arg == "uniform_int")
+    {
+        return Distribution::uniform_int;
+    }
+    if(distribution_arg == "uniform_real")
+    {
+        return Distribution::uniform_real;
+    }
+    if(distribution_arg == "normal")
+    {
+        return Distribution::normal;
+    }
+    if(distribution_arg == "poisson")
+    {
+        return Distribution::poisson;
+    }
+
+    // None of the known distribution names matched: report the offending value and stop.
+    std::cerr << distribution_arg << " is not a valid distribution." << std::endl;
+    exit(error_exit_code);
+}
+
 } // namespace
 
 int main(const int argc, const char** argv)
 {
-    CliArguments args;
-    try
+    // Query the number of HIP devices in the system first, so that the device id
+    // supplied on the command line can be validated against it below.
+    int number_of_devices = 0;
+    HIP_CHECK(hipGetDeviceCount(&number_of_devices));
+
+    if(number_of_devices <= 0)
     {
-        // Parsing command line arguments. If something unexpected happens (e.g. missing arguments or
-        // wrong format), an exception is thrown.
-        args = parse_args(argc, argv);
-        // The parsed arguments are logged to the output to provide feedback to the user.
-        // For implementation, see `std::ostream& operator<<(std::ostream& os, const CliArguments& cli_args)`
-        std::cout << args << std::endl;
+        std::cerr << "HIP supported devices not found!"
+                  << "\n";
+        exit(error_exit_code);
     }
-    catch(const std::exception& ex)
+
+    // Parse user inputs
+    cli::Parser parser(argc, argv);
+    configure_parser(parser);
+    parser.run_and_exit_if_error();
+
+    // Get user arguments, if provided.
+    const int device_id = parser.get<int>("device");
+    if(device_id < 0 || device_id >= number_of_devices)
     {
-        // The exception is caught, and an error message and the command line help is printed.
-        // The program returns with a non-zero exit code.
-        std::cerr << "Could not parse arguments. Error: "s.append(ex.what()) << "\n"
-                  << cli_usage_message << std::endl;
-        return error_exit_code;
+        std::cerr << "Invalid device id " << device_id << "!\n"
+                  << "Device does not exist\n";
+        exit(error_exit_code);
     }
 
+    const Distribution distribution = get_distribution(parser.get<std::string>("distribution"));
+    const size_t       size         = parser.get<size_t>("size");
+    const bool         print        = parser.get<bool>("print");
+
     // Set up the used device (GPU) according to the command line supplied argument.
-    set_device(args.device_id_);
+    set_device(device_id);
 
     // Run the selected measurement on the device (GPU) and host (CPU).
-    dispatch_distribution_type(args.distribution_, args.size_, args.print_);
+    dispatch_distribution_type(distribution, size, print);
 }
diff --git a/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj b/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj
index f0df0007..a7664232 100644
--- a/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj
+++ b/Libraries/rocRAND/simple_distributions_cpp/simple_distributions_cpp_vs2019.vcxproj
@@ -1,102 +1,104 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="main.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\..\Common\example_utils.hpp" />
-    <ClInclude Include="argument_parsing.hpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <VCProjectVersion>15.0</VCProjectVersion>
-    <ProjectGuid>{13bb009a-0679-49c0-a763-3f0a388ea78f}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>simple_distributions_cpp_vs2019</RootNamespace>
-    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>HIP</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>HIP</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
-  </ImportGroup>
-  <ImportGroup Label="Shared">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-    <TargetName>rocrand_$(ProjectName)</TargetName>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>false</LinkIncremental>
-    <TargetName>rocrand_$(ProjectName)</TargetName>
-  </PropertyGroup>
-  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
-  </PropertyGroup>
-  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level1</WarningLevel>
-      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level2</WarningLevel>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
-  </ImportGroup>
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="main.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\Common\example_utils.hpp" />
+    <ClInclude Include="..\..\..\Common\cmdparser.hpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{13bb009a-0679-49c0-a763-3f0a388ea78f}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>simple_distributions_cpp_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(HIPPropertiesImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>rocrand_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>rocrand_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level1</WarningLevel>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>__HIP_ROCclr__;__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(MSBuildProjectDirectory)\..\..\..\Common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>rocrand.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
+  </ImportGroup>
 </Project>
\ No newline at end of file
diff --git a/Libraries/rocThrust/norm/main.hip b/Libraries/rocThrust/norm/main.hip
index 0b89fb63..2b286274 100644
--- a/Libraries/rocThrust/norm/main.hip
+++ b/Libraries/rocThrust/norm/main.hip
@@ -27,6 +27,7 @@
 
 #include <thrust/device_vector.h>
 #include <thrust/reduce.h>
+#include <thrust/transform_reduce.h>
 
 #include "example_utils.hpp"
 
@@ -70,4 +71,4 @@ int main()
 
     // print the Euclidean norm
     std::cout << "The Euclidean norm is: " << norm << std::endl;
-}
\ No newline at end of file
+}
diff --git a/Libraries/rocThrust/remove_points/Makefile b/Libraries/rocThrust/remove_points/Makefile
index a21a9ef5..068b60ae 100644
--- a/Libraries/rocThrust/remove_points/Makefile
+++ b/Libraries/rocThrust/remove_points/Makefile
@@ -26,7 +26,7 @@ CXX_STD = c++17
 COMMON_INCLUDE_DIR = ../../../Common
 
 rocthrust_remove_points: main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
-	$(HIPCXX) $^ -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@
+	$(HIPCXX) $< -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@
 
 clean:
 	rm -f rocthrust_remove_points
diff --git a/Libraries/rocThrust/saxpy/Makefile b/Libraries/rocThrust/saxpy/Makefile
index 887610db..1587cdcb 100644
--- a/Libraries/rocThrust/saxpy/Makefile
+++ b/Libraries/rocThrust/saxpy/Makefile
@@ -26,7 +26,7 @@ CXX_STD = c++17
 COMMON_INCLUDE_DIR = ../../../Common
 
 rocthrust_saxpy: main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
-	$(HIPCXX) $^ -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@
+	$(HIPCXX) $< -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@
 
 clean:
 	rm -f rocthrust_saxpy
diff --git a/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj b/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj
index 169f0497..c12faddc 100644
--- a/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj
+++ b/Libraries/rocThrust/saxpy/saxpy_vs2019.vcxproj
@@ -52,9 +52,11 @@
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
+    <TargetName>rocthrust_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <LinkIncremental>false</LinkIncremental>
+    <TargetName>rocthrust_$(ProjectName)</TargetName>
   </PropertyGroup>
   <PropertyGroup Label="HIP" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <TargetGPUArchitectures>gfx1030</TargetGPUArchitectures>
@@ -94,4 +96,4 @@
   <ImportGroup Label="ExtensionTargets">
     <Import Condition="'$(HIPTargetsImported)' != 'true'" Project="$(VCTargetsPath)\AMD.HIP.Common.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Libraries/rocThrust/vectors/Makefile b/Libraries/rocThrust/vectors/Makefile
index a441a896..cfc50037 100644
--- a/Libraries/rocThrust/vectors/Makefile
+++ b/Libraries/rocThrust/vectors/Makefile
@@ -26,7 +26,7 @@ CXX_STD = c++17
 COMMON_INCLUDE_DIR = ../../../Common
 
 rocthrust_vectors: main.hip $(COMMON_INCLUDE_DIR)/example_utils.hpp
-	$(HIPCXX) $^ -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@
+	$(HIPCXX) $< -std=$(CXX_STD) -I $(COMMON_INCLUDE_DIR) -o $@
 
 clean:
 	rm -f rocthrust_vectors
diff --git a/README.md b/README.md
index 80eb7538..55d7a3a1 100644
--- a/README.md
+++ b/README.md
@@ -3,18 +3,24 @@ This project is currently unsupported and in an early testing stage. Feedback on
 ## Repository Contents
 - [Common](/Common/) contains common utility functionality shared between the examples.
 - [HIP-Basic](/HIP-Basic/) hosts self-contained recipes showcasing HIP runtime functionality.
+    - [assembly_to_executable](/HIP-Basic/assembly_to_executable): Program and accompanying build systems that show how to manually compile and link a HIP application from host and device code.
+    - [bandwidth](/HIP-Basic/bandwidth): Program that measures memory bandwidth from host to device, device to host, and device to device.
     - [device_query](/HIP-Basic/device_query): Program that showcases how properties from the device may be queried.
     - [dynamic_shared](/HIP-Basic/dynamic_shared): Program that showcases how to use dynamic shared memory with the help of a simple matrix transpose kernel.
     - [events](/HIP-Basic/events/): Measuring execution time and synchronizing with HIP events.
     - [hello_world](/HIP-Basic/hello_world): Simple program that showcases launching kernels and printing from the device.
     - [hipify](/HIP-Basic/hipify): Simple program and build definitions that showcase automatically converting a CUDA `.cu` source into portable HIP `.hip` source.
+    - [llvm_ir_to_executable](/HIP-Basic/llvm_ir_to_executable): Shows how to create a HIP executable from LLVM IR.
     - [matrix_multiplication](/HIP-Basic/matrix_multiplication/): Multiply two dynamically sized matrices utilizing shared memory.
     - [occupancy](/HIP-Basic/occupancy/): Shows how to find optimal configuration parameters for a kernel launch with maximum occupancy.
+    - [runtime_compilation](/HIP-Basic/runtime_compilation/): Simple program that showcases how to use HIP runtime compilation (hipRTC) to compile a kernel and launch it on a device.
     - [saxpy](/HIP-Basic/saxpy/): Implements the $Y_i=aX_i+Y_i$ kernel and explains basic HIP functionality.
+    - [shared_memory](/HIP-Basic/shared_memory/): Showcases how to use static shared memory by implementing a simple matrix transpose kernel.
     - [streams](/HIP-Basic/streams/): Program that showcases usage of multiple streams each with their own tasks.
-- [Dockerfiles](/Dockerfiles/) hosts Dockerfiles with ready-to-use environments for the various samples. See [Dockerfiles/README.md](Dockerfiles/README.md) for details.
-- [docs](/docs/)
-    - [CONTRIBUTING.md](docs/CONTRIBUTING.md) contains information on how to contribute to the examples.
+    - [warp_shuffle](/HIP-Basic/warp_shuffle/): Uses a simple matrix transpose kernel to showcase how to use warp shuffle operations.
+- [Dockerfiles](/Dockerfiles/) hosts Dockerfiles with ready-to-use environments for the various samples. See [Dockerfiles/README.md](/Dockerfiles/README.md) for details.
+- [Docs](/Docs/)
+    - [CONTRIBUTING.md](/Docs/CONTRIBUTING.md) contains information on how to contribute to the examples.
 - [Libraries](/Libraries/)
     - [hipCUB](/Libraries/hipCUB/)
         - [device_radix_sort](/Libraries/hipCUB/device_radix_sort/): Simple program that showcases `hipcub::DeviceRadixSort::SortPairs`.
diff --git a/ROCm-Examples-VS2019.sln b/ROCm-Examples-VS2019.sln
index 1e2cfc94..675964e0 100644
--- a/ROCm-Examples-VS2019.sln
+++ b/ROCm-Examples-VS2019.sln
@@ -1,4 +1,3 @@
-
 Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 16
 VisualStudioVersion = 16.0.32630.194
@@ -49,12 +48,24 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix_multiplication_vs201
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "occupancy_vs2019", "HIP-Basic\occupancy\occupancy_vs2019.vcxproj", "{E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "runtime_compilation_vs2019", "HIP-Basic\runtime_compilation\runtime_compilation_vs2019.vcxproj", "{E03790B7-B203-4504-BEF5-F4F061183642}"
+EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dynamic_shared_vs2019", "HIP-Basic\dynamic_shared\dynamic_shared_vs2019.vcxproj", "{7B7D1745-7635-40DA-B6AF-B8F728A31124}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "shared_memory_vs2019", "HIP-Basic\shared_memory\shared_memory_vs2019.vcxproj", "{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}"
+EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "streams_vs2019", "HIP-Basic\streams\streams_vs2019.vcxproj", "{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "events_vs2019", "HIP-Basic\events\events_vs2019.vcxproj", "{5B822836-110B-44D8-8E02-2A9B2CB83D14}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidth_vs2019", "HIP-Basic\bandwidth\bandwidth_vs2019.vcxproj", "{16B11B54-CD72-43B6-B226-38C668B41A79}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_shuffle_vs2019", "HIP-Basic\warp_shuffle\warp_shuffle_vs2019.vcxproj", "{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "assembly_to_executable_vs2019", "HIP-Basic\assembly_to_executable\assembly_to_executable_vs2019.vcxproj", "{60B4ADE0-8286-46AE-B884-5DA51B541DED}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "llvm_ir_to_executable_vs2019", "HIP-Basic\llvm_ir_to_executable\llvm_ir_to_executable_vs2019.vcxproj", "{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|x64 = Debug|x64
@@ -125,10 +136,18 @@ Global
 		{E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}.Debug|x64.Build.0 = Debug|x64
 		{E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}.Release|x64.ActiveCfg = Release|x64
 		{E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5}.Release|x64.Build.0 = Release|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.ActiveCfg = Debug|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Debug|x64.Build.0 = Debug|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.ActiveCfg = Release|x64
+		{E03790B7-B203-4504-BEF5-F4F061183642}.Release|x64.Build.0 = Release|x64
 		{7B7D1745-7635-40DA-B6AF-B8F728A31124}.Debug|x64.ActiveCfg = Debug|x64
 		{7B7D1745-7635-40DA-B6AF-B8F728A31124}.Debug|x64.Build.0 = Debug|x64
 		{7B7D1745-7635-40DA-B6AF-B8F728A31124}.Release|x64.ActiveCfg = Release|x64
 		{7B7D1745-7635-40DA-B6AF-B8F728A31124}.Release|x64.Build.0 = Release|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.ActiveCfg = Debug|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Debug|x64.Build.0 = Debug|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.ActiveCfg = Release|x64
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D}.Release|x64.Build.0 = Release|x64
 		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}.Debug|x64.ActiveCfg = Debug|x64
 		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}.Debug|x64.Build.0 = Debug|x64
 		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9}.Release|x64.ActiveCfg = Release|x64
@@ -137,6 +156,22 @@ Global
 		{5B822836-110B-44D8-8E02-2A9B2CB83D14}.Debug|x64.Build.0 = Debug|x64
 		{5B822836-110B-44D8-8E02-2A9B2CB83D14}.Release|x64.ActiveCfg = Release|x64
 		{5B822836-110B-44D8-8E02-2A9B2CB83D14}.Release|x64.Build.0 = Release|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.ActiveCfg = Debug|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Debug|x64.Build.0 = Debug|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.ActiveCfg = Release|x64
+		{16B11B54-CD72-43B6-B226-38C668B41A79}.Release|x64.Build.0 = Release|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.ActiveCfg = Debug|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Debug|x64.Build.0 = Debug|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.ActiveCfg = Release|x64
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045}.Release|x64.Build.0 = Release|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.ActiveCfg = Debug|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Debug|x64.Build.0 = Debug|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.ActiveCfg = Release|x64
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED}.Release|x64.Build.0 = Release|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.ActiveCfg = Debug|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Debug|x64.Build.0 = Debug|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.ActiveCfg = Release|x64
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -155,7 +190,7 @@ Global
 		{631C61AA-52BA-4818-BD39-FA9CF47076C7} = {481D0AFC-64BC-436C-9FF5-7C07F9F8E4BD}
 		{E1D552CF-3FE3-427A-95E1-8CFFB60BBF8E} = {481D0AFC-64BC-436C-9FF5-7C07F9F8E4BD}
 		{0A489EDA-4BAD-4966-B439-37260D37D969} = {052412EF-7CEB-4E32-96F9-AADBC70945D7}
-		{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D} = {0A489EDA-4BAD-4966-B439-37260D37D969}
+		{B885EF49-EDAA-4474-8D31-E0EF71D2BB3D} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 		{82BF226F-956B-4E2E-B295-71C17F33A5FB} = {052412EF-7CEB-4E32-96F9-AADBC70945D7}
 		{E71DB5FB-A1C4-4BB4-8B46-0037C32C885E} = {82BF226F-956B-4E2E-B295-71C17F33A5FB}
 		{65B21869-2BE2-4DA5-BEC5-28D1F910731C} = {82BF226F-956B-4E2E-B295-71C17F33A5FB}
@@ -163,9 +198,15 @@ Global
 		{D6334F08-D560-439A-A704-ADA0349D72B7} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 		{ACC2A1E7-5865-4FAE-9016-E6EF73F8FA9E} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 		{E5B2FC79-3928-47F6-B57B-33AAA3C5D9C5} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
+		{E03790B7-B203-4504-BEF5-F4F061183642} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 		{7B7D1745-7635-40DA-B6AF-B8F728A31124} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
+		{C370ACB7-AE52-4AD8-8C3D-4C32567FFE7D} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 		{4E6B2034-D7ED-4CB4-98B2-7B2D2B71E0A9} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 		{5B822836-110B-44D8-8E02-2A9B2CB83D14} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
+		{16B11B54-CD72-43B6-B226-38C668B41A79} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
+		{5852BE0E-BDA5-4BD9-8A16-30E8E40F4045} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
+		{60B4ADE0-8286-46AE-B884-5DA51B541DED} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
+		{DBB8DFE9-CB1B-473C-937C-2A8120E0D819} = {6EB7144D-2707-489E-A043-D59B7BE006D1}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {90580497-38BF-428E-A951-6EC6CFC68193}
diff --git a/scripts/code-format/check-format.sh b/Scripts/CodeFormat/check_format.sh
similarity index 100%
rename from scripts/code-format/check-format.sh
rename to Scripts/CodeFormat/check_format.sh