You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
345 lines
12 KiB
345 lines
12 KiB
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions |
|
* are met: |
|
* * Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* * Neither the name of NVIDIA CORPORATION nor the names of its |
|
* contributors may be used to endorse or promote products derived |
|
* from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY |
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
*/ |
|
|
|
|
|
/* Matrix multiplication: C = A * B. |
|
* Host code. |
|
* |
|
* This sample revisits matrix multiplication with CUDA task. The code of matrix |
|
* multiplication is exactly the same as in matrixMulDrv sample of this SDK. |
|
* This sample, however, demonstrates how to link CUDA driver at runtime and |
|
* how to perform JIT (just-in-time) compilation of CUDA kernel from PTX image, |
|
* stored in memory. |
|
* |
|
* For more details on acquiring auto-generated sources refer README.TXT file |
|
* in "extras" directory. |
|
* |
|
* Unlike CUBLAS, the sample doesn't address high-performance matrix |
|
* multiplication. |
|
*/ |
|
|
|
// includes, system |
|
#include <math.h> |
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
#include <string.h> |
|
|
|
// includes, CUDA |
|
#include "cuda_drvapi_dynlink.h" |
|
#include "helper_cuda_drvapi.h" |
|
|
|
// includes, project |
|
#include "matrixMul.h" |
|
#include "matrixMul_kernel_32_ptxdump.h" |
|
#include "matrixMul_kernel_64_ptxdump.h" |
|
|
|
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int); |
|
|
|
#if defined _MSC_VER |
|
#pragma warning(disable : 4312) |
|
#endif |
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
// Globals |
|
//////////////////////////////////////////////////////////////////////////////// |
|
CUcontext g_cuContext; |
|
bool noprompt = false; |
|
|
|
static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)"; |
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
// Allocates a matrix with random float entries |
|
//////////////////////////////////////////////////////////////////////////////// |
|
void randomInit(float *data, size_t size) |
|
{ |
|
for (size_t i = 0; i < size; ++i) { |
|
data[i] = rand() / (float)RAND_MAX; |
|
} |
|
} |
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
// CUDA driver runtime linking and initialization |
|
//////////////////////////////////////////////////////////////////////////////// |
|
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out) |
|
{ |
|
CUresult status; |
|
CUdevice cuDevice; |
|
CUmodule cuModule; |
|
CUfunction cuFunction; |
|
int major, minor, block_size, devID = 0; |
|
char deviceName[256]; |
|
|
|
// link to cuda driver dynamically |
|
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); |
|
|
|
// This assumes that the user is attempting to specify a explicit device -device=n |
|
if (argc > 1) { |
|
bool bFound = false; |
|
|
|
for (int param = 0; param < argc; param++) { |
|
if (!strncmp(argv[param], "-device", 7)) { |
|
int i = (int)strlen(argv[1]); |
|
|
|
while (argv[1][i] != '=') { |
|
i--; |
|
} |
|
|
|
devID = atoi(&argv[1][++i]); |
|
bFound = true; |
|
} |
|
|
|
if (bFound) |
|
break; |
|
} |
|
} |
|
|
|
// get cuda-capable device count |
|
int deviceCount = 0; |
|
checkCudaErrors(cuDeviceGetCount(&deviceCount)); |
|
|
|
if (deviceCount == 0) { |
|
fprintf(stderr, "No devices supporting CUDA detected, exiting...\n"); |
|
exit(EXIT_SUCCESS); |
|
} |
|
|
|
if (devID < 0) |
|
devID = 0; |
|
|
|
if (devID > deviceCount - 1) { |
|
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount); |
|
status = CUDA_ERROR_NOT_FOUND; |
|
|
|
cuCtxDestroy(g_cuContext); |
|
exit(EXIT_FAILURE); |
|
} |
|
|
|
// pick up device with zero ordinal (default, or devID) |
|
checkCudaErrors(cuDeviceGet(&cuDevice, devID)); |
|
|
|
// get compute capabilities and the devicename |
|
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, cuDevice)); |
|
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice)); |
|
printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor); |
|
|
|
block_size = 32; |
|
*block_size_out = block_size; |
|
|
|
// create context for picked device |
|
status = cuCtxCreate(&g_cuContext, 0, cuDevice); |
|
|
|
if (CUDA_SUCCESS != status) { |
|
cuCtxDestroy(g_cuContext); |
|
exit(EXIT_SUCCESS); |
|
} |
|
|
|
// setup JIT compilation options and perform compilation |
|
{ |
|
// in this branch we use compilation with parameters |
|
const unsigned int jitNumOptions = 3; |
|
CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; |
|
void **jitOptVals = new void *[jitNumOptions]; |
|
|
|
// set up size of compilation log buffer |
|
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; |
|
int jitLogBufferSize = 1024; |
|
jitOptVals[0] = (void *)(size_t)jitLogBufferSize; |
|
|
|
// set up pointer to the compilation log buffer |
|
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; |
|
char *jitLogBuffer = new char[jitLogBufferSize]; |
|
jitOptVals[1] = jitLogBuffer; |
|
|
|
// set up pointer to set the Maximum # of registers for a particular kernel |
|
jitOptions[2] = CU_JIT_MAX_REGISTERS; |
|
int jitRegCount = 32; |
|
jitOptVals[2] = (void *)(size_t)jitRegCount; |
|
|
|
// compile with set parameters |
|
printf("> Compiling CUDA module\n"); |
|
|
|
#if defined(_WIN64) || defined(__LP64__) |
|
status = |
|
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); |
|
#else |
|
status = |
|
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); |
|
#endif |
|
|
|
printf("> PTX JIT log:\n%s\n", jitLogBuffer); |
|
|
|
delete[] jitOptions; |
|
delete[] jitOptVals; |
|
delete[] jitLogBuffer; |
|
} |
|
|
|
if (CUDA_SUCCESS != status) { |
|
printf("Error while compiling PTX\n"); |
|
cuCtxDestroy(g_cuContext); |
|
exit(EXIT_FAILURE); |
|
} |
|
|
|
// retrieve CUDA function from the compiled module |
|
status = cuModuleGetFunction( |
|
&cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); |
|
|
|
if (CUDA_SUCCESS != status) { |
|
cuCtxDestroy(g_cuContext); |
|
exit(EXIT_FAILURE); |
|
} |
|
|
|
*pMatrixMul = cuFunction; |
|
return CUDA_SUCCESS; |
|
} |
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////// |
|
// Entry point |
|
//////////////////////////////////////////////////////////////////////////////// |
|
int main(int argc, char **argv) |
|
{ |
|
printf("[ %s ]\n", sSDKsample); |
|
|
|
// initialize CUDA |
|
CUfunction matrixMul = NULL; |
|
int block_size = 0; |
|
checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size)); |
|
|
|
// set seed for rand() |
|
srand(2006); |
|
|
|
// allocate host memory for matrices A and B |
|
size_t size_A = WA * HA; |
|
size_t mem_size_A = sizeof(float) * size_A; |
|
size_t size_B = WB * HB; |
|
size_t mem_size_B = sizeof(float) * size_B; |
|
|
|
float *h_A = (float *)malloc(mem_size_A); |
|
float *h_B = (float *)malloc(mem_size_B); |
|
|
|
// initialize host memory |
|
randomInit(h_A, size_A); |
|
randomInit(h_B, size_B); |
|
|
|
// allocate device memory |
|
CUdeviceptr d_A; |
|
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); |
|
CUdeviceptr d_B; |
|
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); |
|
|
|
// copy host memory to device |
|
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); |
|
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); |
|
|
|
// allocate device memory for result |
|
size_t size_C = WC * HC; |
|
size_t mem_size_C = sizeof(float) * size_C; |
|
|
|
CUdeviceptr d_C; |
|
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); |
|
|
|
// allocate mem for the result on host side |
|
float *h_C = (float *)malloc(mem_size_C); |
|
|
|
#if __CUDA_API_VERSION >= 4000 |
|
{ |
|
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) |
|
int Matrix_Width_A = WA; |
|
int Matrix_Width_B = WB; |
|
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; |
|
|
|
checkCudaErrors(cuLaunchKernel( |
|
matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL)); |
|
} |
|
#else // __CUDA_API_VERSION <= 3020 |
|
{ |
|
// This is the older CUDA Driver API for Kernel Parameter passing and Kernel Launching |
|
int offset = 0; |
|
{ |
|
// setup execution parameters |
|
checkCudaErrors(cuParamSetv(matrixMul, offset, &d_C, sizeof(d_C))); |
|
offset += sizeof(d_C); |
|
|
|
checkCudaErrors(cuParamSetv(matrixMul, offset, &d_A, sizeof(d_A))); |
|
offset += sizeof(d_A); |
|
|
|
checkCudaErrors(cuParamSetv(matrixMul, offset, &d_B, sizeof(d_B))); |
|
offset += sizeof(d_B); |
|
} |
|
|
|
int Matrix_Width_A = WA; |
|
int Matrix_Width_B = WB; |
|
|
|
checkCudaErrors(cuParamSeti(matrixMul, offset, Matrix_Width_A)); |
|
offset += sizeof(Matrix_Width_A); |
|
|
|
checkCudaErrors(cuParamSeti(matrixMul, offset, Matrix_Width_B)); |
|
offset += sizeof(Matrix_Width_B); |
|
|
|
checkCudaErrors(cuParamSetSize(matrixMul, offset)); |
|
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1)); |
|
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float))); |
|
|
|
// set execution configuration for the CUDA kernel |
|
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size)); |
|
} |
|
#endif |
|
|
|
checkCudaErrors(cuCtxSynchronize()); |
|
|
|
// copy result from device to host |
|
checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C)); |
|
|
|
// compute reference solution |
|
float *reference = (float *)malloc(mem_size_C); |
|
computeGold(reference, h_A, h_B, HA, WA, WB); |
|
|
|
// check result |
|
float diff = 0.0f; |
|
|
|
for (unsigned int i = 0; i < size_C; i++) { |
|
float tmp = reference[i] - h_C[i]; |
|
diff += tmp * tmp; |
|
} |
|
|
|
int res = (diff / (float)size_C < 1e-6f); |
|
|
|
// clean up memory |
|
free(h_A); |
|
free(h_B); |
|
free(h_C); |
|
free(reference); |
|
checkCudaErrors(cuMemFree(d_A)); |
|
checkCudaErrors(cuMemFree(d_B)); |
|
checkCudaErrors(cuMemFree(d_C)); |
|
checkCudaErrors(cuCtxDestroy(g_cuContext)); |
|
|
|
printf("Test run %s\n", (1 == res) ? "success!" : "failed!"); |
|
|
|
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE); |
|
}
|
|
|