/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <assert.h>
#include <climits>
#include <cstdlib> // rand(), srand()
#include <cstring> // memset()
#include <ctime>   // time(), used to seed rand()
#include <stdio.h>
#include <vector>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#define THREADS_PER_BLOCK 512
#define ALLOWABLE_VARIANCE 1.e-6f
#define NUM_ELEMENTS 8000000

// Stores the square of each input element in output array
__global__ void squareArray(const float *input, float *output, int numElements)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < numElements) {
        output[idx] = input[idx] * input[idx];
    }
}

// Stores the negative of each input element in output array
__global__ void negateArray(const float *input, float *output, int numElements)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < numElements) {
        output[idx] = input[idx] * -1;
    }
}

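// Host-side bookkeeping for the sample: the input data, output buffers for
// the squared and negated-square results, and the sizes used for allocations
// and kernel launches.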
struct negSquareArrays
{
    float *input;
    float *square;
    float *negSquare;
    int numElements;
    size_t bytes;
    size_t numBlocks;
};

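// Fills array with pseudo-random floats in [0, 1].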
void fillRandomly(float *array, int numElements)
{
    for (int n = 0; n < numElements; n++) {
        array[n] = rand() / (float)RAND_MAX;
    }
}

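// Overwrites both output arrays with random data so a stale result from a
// previous run cannot pass validation.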
void resetOutputArrays(negSquareArrays *hostArrays)
{
    fillRandomly(hostArrays->square, hostArrays->numElements);
    fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}

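// Allocates the host arrays, computes the launch geometry, and fills all
// three arrays with random data.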
void prepareHostArrays(negSquareArrays *hostArrays)
{
    hostArrays->numElements = NUM_ELEMENTS;
    size_t bytes = hostArrays->numElements * sizeof(float);

    // Round the block count up so a partial final block covers any remainder.
    size_t numBlocks = hostArrays->numElements / (size_t)THREADS_PER_BLOCK;
    if ((hostArrays->numElements % THREADS_PER_BLOCK) != 0) {
        numBlocks++;
    }

    hostArrays->input = (float *)malloc(bytes);
    hostArrays->square = (float *)malloc(bytes);
    hostArrays->negSquare = (float *)malloc(bytes);
    hostArrays->bytes = bytes;
    hostArrays->numBlocks = numBlocks;

    fillRandomly(hostArrays->input, hostArrays->numElements);
    fillRandomly(hostArrays->square, hostArrays->numElements);
    fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}

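// Builds and instantiates a graph containing a single memory free node for
// dPtr, used to free memory that was allocated by a different graph.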
void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr)
{
    cudaGraph_t graph;
    cudaGraphNode_t freeNode;

    checkCudaErrors(cudaGraphCreate(&graph, 0));

    checkCudaErrors(cudaGraphAddMemFreeNode(&freeNode, graph, NULL, 0, (void *)dPtr));

    checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
    checkCudaErrors(cudaGraphDestroy(graph));
}

/**
 * Demonstrates explicitly creating a CUDA graph including memory nodes.
 * createNegateSquaresGraphWithStreamCapture constructs an equivalent graph
 * using stream capture.
 *
 * If d_negSquare_out is non-NULL, then:
 * 1) d_negSquare will not be freed;
 * 2) the value of d_negSquare_out will be set to d_negSquare.
 *
 * Diagram of the graph constructed by createNegateSquaresGraphExplicitly:
 *
 *          alloc d_input
 *                |
 *          alloc d_square
 *                |
 *      Memcpy input to device
 *                |
 *  launch kernel squareArray ------->---- Memcpy d_square to host
 *                |                                  |
 *          free d_input                             |
 *                |                                  |
 *      allocate d_negSquare                         |
 *                |                                  |
 *  launch kernel negateArray ------->---- free d_square
 *                |
 *  Memcpy d_negSquare to host
 *                |
 *        free d_negSquare
 */
void createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec,
                                        int device,
                                        negSquareArrays *hostArrays,
                                        float **d_negSquare_out = NULL)
{
    // Array buffers on device
    float *d_input, *d_square, *d_negSquare;

    // Memory allocation parameters
    cudaMemAllocNodeParams allocParams;
    memset(&allocParams, 0, sizeof(allocParams));
    allocParams.bytesize = hostArrays->bytes;
    allocParams.poolProps.allocType = cudaMemAllocationTypePinned;
    allocParams.poolProps.location.id = device;
    allocParams.poolProps.location.type = cudaMemLocationTypeDevice;

    // Kernel launch parameters
    cudaKernelNodeParams kernelNodeParams = {0};
    kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1);
    kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.extra = NULL;

    cudaGraph_t graph;
    cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare;
    cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare;
    cudaGraphNode_t squareKernelNode, negateKernelNode;
    cudaGraphNode_t freeNodeInput, freeNodeSquare;

    // Buffer for storing graph node dependencies
    std::vector<cudaGraphNode_t> nodeDependencies;

    checkCudaErrors(cudaGraphCreate(&graph, 0));

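    // cudaGraphAddMemAllocNode returns the allocation's device address in
    // allocParams.dptr, so each pointer is captured immediately after its
    // node is added; the same allocParams struct is reused for all three
    // allocations.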
    checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams));
    d_input = (float *)allocParams.dptr;

    // To keep the graph structure simple (fewer branching dependencies),
    // allocNodeSquare should depend on allocNodeInput
    checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph, &allocNodeInput, 1, &allocParams));
    d_square = (float *)allocParams.dptr;

    // copyNodeInput needs to depend on allocNodeInput because copyNodeInput
    // writes to d_input. It does so here indirectly through allocNodeSquare.
    checkCudaErrors(cudaGraphAddMemcpyNode1D(&copyNodeInput,
                                             graph,
                                             &allocNodeSquare,
                                             1,
                                             d_input,
                                             hostArrays->input,
                                             hostArrays->bytes,
                                             cudaMemcpyHostToDevice));

    void *squareKernelArgs[3] = {(void *)&d_input, (void *)&d_square, (void *)&(hostArrays->numElements)};
    kernelNodeParams.func = (void *)squareArray;
    kernelNodeParams.kernelParams = (void **)squareKernelArgs;

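    // The kernel parameter data is copied when the node is added, so it is
    // safe for these argument arrays to be stack locals that go out of scope
    // before the graph is launched.
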
    // Square kernel depends on copyNodeInput to ensure all data is on the
    // device before kernel launch.
    checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph, &copyNodeInput, 1, &kernelNodeParams));

    checkCudaErrors(cudaGraphAddMemcpyNode1D(&copyNodeSquare,
                                             graph,
                                             &squareKernelNode,
                                             1,
                                             hostArrays->square,
                                             d_square,
                                             hostArrays->bytes,
                                             cudaMemcpyDeviceToHost));

    // Free of d_input depends on the square kernel to ensure that d_input is
    // not freed while being read by the kernel. It also depends on the alloc
    // of d_input via squareKernelNode > copyNodeInput > allocNodeSquare >
    // allocNodeInput.
    checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph, &squareKernelNode, 1, d_input));

    // Allocation of d_negSquare depends on the free of d_input so CUDA can
    // reuse the virtual address.
    checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph, &freeNodeInput, 1, &allocParams));
    d_negSquare = (float *)allocParams.dptr;

    if (d_negSquare == d_input) {
        printf("Check verified that d_negSquare and d_input share a virtual "
               "address.\n");
    }

    void *negateKernelArgs[3] = {(void *)&d_square, (void *)&d_negSquare, (void *)&(hostArrays->numElements)};
    kernelNodeParams.func = (void *)negateArray;
    kernelNodeParams.kernelParams = (void **)negateKernelArgs;

    checkCudaErrors(cudaGraphAddKernelNode(&negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams));

    // d_square can be freed only after both of its consumers, the
    // device-to-host copy and the negate kernel, have finished with it.
    nodeDependencies.push_back(copyNodeSquare);
    nodeDependencies.push_back(negateKernelNode);
    checkCudaErrors(
        cudaGraphAddMemFreeNode(&freeNodeSquare, graph, nodeDependencies.data(), nodeDependencies.size(), d_square));
    nodeDependencies.clear();

    checkCudaErrors(cudaGraphAddMemcpyNode1D(&copyNodeNegSquare,
                                             graph,
                                             &negateKernelNode,
                                             1,
                                             hostArrays->negSquare,
                                             d_negSquare,
                                             hostArrays->bytes,
                                             cudaMemcpyDeviceToHost));

    if (d_negSquare_out == NULL) {
        cudaGraphNode_t freeNodeNegSquare;
        checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeNegSquare, graph, &copyNodeNegSquare, 1, d_negSquare));
    }
    else {
        *d_negSquare_out = d_negSquare;
    }

    checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
    checkCudaErrors(cudaGraphDestroy(graph));
}

/**
 * Adds work to a CUDA stream which negates the square of values in the input
 * array.
 *
 * If d_negSquare_out is non-NULL, then:
 * 1) d_negSquare will not be freed;
 * 2) the value of d_negSquare_out will be set to d_negSquare.
 *
 * Diagram of the stream operations in doNegateSquaresInStream:
 * ---------------------------------------------------------------------
 * |              STREAM1               |           STREAM2            |
 * ---------------------------------------------------------------------
 *
 *   alloc d_input
 *         |
 *   alloc d_square
 *         |
 *   Memcpy input to device
 *         |
 *   launch kernel squareArray
 *         |
 *   record squareKernelCompleteEvent -->-- wait squareKernelCompleteEvent
 *         |                                        |
 *   free d_input                                   |
 *         |                                        |
 *   allocate d_negSquare                  Memcpy d_square to host
 *         |                                        |
 *   launch kernel negateArray                      |
 *         |                                        |
 *   record negateKernelCompleteEvent -->-- wait negateKernelCompleteEvent
 *         |                                        |
 *   Memcpy d_negSquare to host                     |
 *         |                                 free d_square
 *   free d_negSquare                               |
 *         |                                        |
 *   wait squareFreeEvent --------------<---- record squareFreeEvent
 */
void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays, float **d_negSquare_out = NULL)
{
    float *d_input, *d_square, *d_negSquare;
    cudaStream_t stream2;
    cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent, squareFreeEvent;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));

    checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent));
    checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent));
    checkCudaErrors(cudaEventCreate(&squareFreeEvent));

    // Virtual addresses are assigned synchronously when cudaMallocAsync is
    // called, thus there is no performance benefit gained by separating the
    // allocations into two streams.
    checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1));
    checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1));

    checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes, cudaMemcpyHostToDevice, stream1));
    squareArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(d_input, d_square, hostArrays->numElements);
    checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1));

    checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0));
    checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square, hostArrays->bytes, cudaMemcpyDeviceToHost, stream2));

    checkCudaErrors(cudaFreeAsync(d_input, stream1));
    checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1));
    negateArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
        d_square, d_negSquare, hostArrays->numElements);
    checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1));
    checkCudaErrors(
        cudaMemcpyAsync(hostArrays->negSquare, d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost, stream1));
    if (d_negSquare_out == NULL) {
        checkCudaErrors(cudaFreeAsync(d_negSquare, stream1));
    }
    else {
        *d_negSquare_out = d_negSquare;
    }

    checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0));
    checkCudaErrors(cudaFreeAsync(d_square, stream2));
    checkCudaErrors(cudaEventRecord(squareFreeEvent, stream2));

    // Make stream1 wait for the free on stream2 so the caller only has to
    // synchronize with stream1.
    checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0));

    checkCudaErrors(cudaStreamDestroy(stream2));
    checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent));
    checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent));
    checkCudaErrors(cudaEventDestroy(squareFreeEvent));
}

/**
 * Demonstrates creating a CUDA graph including memory nodes using stream
 * capture. createNegateSquaresGraphExplicitly constructs an equivalent graph
 * without stream capture.
 */
void createNegateSquaresGraphWithStreamCapture(cudaGraphExec_t *graphExec,
                                               negSquareArrays *hostArrays,
                                               float **d_negSquare_out = NULL)
{
    cudaGraph_t graph;
    cudaStream_t stream;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

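    // While capture is active, the work doNegateSquaresInStream submits to
    // the stream (including its cudaMallocAsync and cudaFreeAsync calls) is
    // recorded as graph nodes rather than executed.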
    checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    doNegateSquaresInStream(stream, hostArrays, d_negSquare_out);
    checkCudaErrors(cudaStreamEndCapture(stream, &graph));

    checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
    checkCudaErrors(cudaStreamDestroy(stream));
    checkCudaErrors(cudaGraphDestroy(graph));
}

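// Computes the reference results on the host, uploads the expected negated
// squares to the device for validateGPU, and allocates the managed
// validation flag.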
void prepareRefArrays(negSquareArrays *hostArrays, negSquareArrays *deviceRefArrays, bool **foundValidationFailure)
{
    deviceRefArrays->bytes = hostArrays->bytes;
    deviceRefArrays->numElements = hostArrays->numElements;

    for (int i = 0; i < hostArrays->numElements; i++) {
        hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i];
        hostArrays->negSquare[i] = hostArrays->square[i] * -1;
    }

    checkCudaErrors(cudaMalloc((void **)&deviceRefArrays->negSquare, deviceRefArrays->bytes));
    checkCudaErrors(
        cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare, hostArrays->bytes, cudaMemcpyHostToDevice));

    checkCudaErrors(cudaMallocManaged((void **)foundValidationFailure, sizeof(bool)));
    // Managed memory is not guaranteed to be zero-initialized; clear the
    // flag before its first use.
    **foundValidationFailure = false;
}

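// Reports the validation result and resets the flag for the next run.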
int checkValidationFailure(bool *foundValidationFailure)
{
    if (*foundValidationFailure) {
        printf("Validation FAILURE!\n\n");
        *foundValidationFailure = false;
        return EXIT_FAILURE;
    }
    else {
        printf("Validation PASSED!\n\n");
        return EXIT_SUCCESS;
    }
}

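// Compares d_negSquare against the device-resident reference results,
// allowing a small relative variance.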
__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays, bool *foundValidationFailure)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float ref, diff;

    if (idx < devRefArrays.numElements) {
        ref = devRefArrays.negSquare[idx];
        diff = d_negSquare[idx] - ref;
        diff *= diff;
        ref *= ref;
        if (diff / ref > ALLOWABLE_VARIANCE) {
            *foundValidationFailure = true;
        }
    }
}

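// Recomputes the expected negated squares on the host and compares them with
// hostArrays->negSquare, allowing a small relative variance.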
void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure)
{
    float ref, diff;

    for (int i = 0; i < hostArrays->numElements; i++) {
        ref = hostArrays->input[i] * hostArrays->input[i] * -1;
        diff = hostArrays->negSquare[i] - ref;
        diff *= diff;
        ref *= ref;
        if (diff / ref > ALLOWABLE_VARIANCE) {
            *foundValidationFailure = true;
        }
    }
}

int main(int argc, char **argv)
{
    negSquareArrays hostArrays, deviceRefArrays;
    cudaStream_t stream;
    cudaGraphExec_t graphExec, graphExecFreeC;

    // Declare pointers for GPU buffers
    float *d_negSquare = NULL;
    bool *foundValidationFailure = NULL;

    srand(time(0));
    int device = findCudaDevice(argc, (const char **)argv);

    int driverVersion = 0;
    int deviceSupportsMemoryPools = 0;

    cudaDriverGetVersion(&driverVersion);
    printf("Driver version is: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10);

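    // Graph memory nodes require CUDA 11.4 (driver version 11040) or newer.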
    if (driverVersion < 11040) {
        printf("Waiving execution as driver does not support Graph Memory Nodes\n");
        exit(EXIT_WAIVED);
    }

    cudaDeviceGetAttribute(&deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, device);
    if (!deviceSupportsMemoryPools) {
        printf("Waiving execution as device does not support Memory Pools\n");
        exit(EXIT_WAIVED);
    }
    else {
        printf("Setting up sample.\n");
    }

    prepareHostArrays(&hostArrays);
    prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure);
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    printf("Setup complete.\n\n");

    printf("Running negateSquares in a stream.\n");
    doNegateSquaresInStream(stream, &hostArrays);
    checkCudaErrors(cudaStreamSynchronize(stream));
    printf("Validating negateSquares in a stream...\n");
    validateHost(&hostArrays, foundValidationFailure);
    checkValidationFailure(foundValidationFailure);
    resetOutputArrays(&hostArrays);

    printf("Running negateSquares in a stream-captured graph.\n");
    createNegateSquaresGraphWithStreamCapture(&graphExec, &hostArrays);
    checkCudaErrors(cudaGraphLaunch(graphExec, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    printf("Validating negateSquares in a stream-captured graph...\n");
    validateHost(&hostArrays, foundValidationFailure);
    checkValidationFailure(foundValidationFailure);
    resetOutputArrays(&hostArrays);
    // graphExec is re-instantiated below; destroy the old executable graph
    // first so it is not leaked.
    checkCudaErrors(cudaGraphExecDestroy(graphExec));

    printf("Running negateSquares in an explicitly constructed graph.\n");
    createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays);
    checkCudaErrors(cudaGraphLaunch(graphExec, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    printf("Validating negateSquares in an explicitly constructed graph...\n");
    validateHost(&hostArrays, foundValidationFailure);
    checkValidationFailure(foundValidationFailure);
    resetOutputArrays(&hostArrays);
    checkCudaErrors(cudaGraphExecDestroy(graphExec));

    // Each of the three examples below frees d_negSquare outside the graph.
    // As demonstrated by validateGPU, d_negSquare can be accessed outside the
    // graph before d_negSquare is freed.

    printf("Running negateSquares with d_negSquare freed outside the stream.\n");
    createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays, &d_negSquare);
    checkCudaErrors(cudaGraphLaunch(graphExec, stream));
    validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
        d_negSquare, deviceRefArrays, foundValidationFailure);
    // Since cudaFree is synchronous, the stream must synchronize before
    // freeing d_negSquare to ensure d_negSquare is no longer being accessed.
    checkCudaErrors(cudaStreamSynchronize(stream));
    checkCudaErrors(cudaFree(d_negSquare));
    printf("Validating negateSquares with d_negSquare freed outside the "
           "stream...\n");
    validateHost(&hostArrays, foundValidationFailure);
    checkValidationFailure(foundValidationFailure);
    resetOutputArrays(&hostArrays);

printf("Running negateSquares with d_negSquare freed outside the graph.\n"); |
|
checkCudaErrors(cudaGraphLaunch(graphExec, stream)); |
|
validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>( |
|
d_negSquare, deviceRefArrays, foundValidationFailure); |
|
checkCudaErrors(cudaFreeAsync(d_negSquare, stream)); |
|
checkCudaErrors(cudaStreamSynchronize(stream)); |
|
printf("Validating negateSquares with d_negSquare freed outside the graph...\n"); |
|
checkValidationFailure(foundValidationFailure); |
|
resetOutputArrays(&hostArrays); |
|
|
|
printf("Running negateSquares with d_negSquare freed in a different graph.\n"); |
|
createFreeGraph(&graphExecFreeC, d_negSquare); |
|
checkCudaErrors(cudaGraphLaunch(graphExec, stream)); |
|
validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>( |
|
d_negSquare, deviceRefArrays, foundValidationFailure); |
|
checkCudaErrors(cudaGraphLaunch(graphExecFreeC, stream)); |
|
checkCudaErrors(cudaStreamSynchronize(stream)); |
|
printf("Validating negateSquares with d_negSquare freed in a different " |
|
"graph...\n"); |
|
checkValidationFailure(foundValidationFailure); |
|
|
|
printf("Cleaning up sample.\n"); |
|
checkCudaErrors(cudaGraphExecDestroy(graphExec)); |
|
checkCudaErrors(cudaGraphExecDestroy(graphExecFreeC)); |
|
checkCudaErrors(cudaStreamDestroy(stream)); |
|
checkCudaErrors(cudaFree(foundValidationFailure)); |
|
checkCudaErrors(cudaFree(deviceRefArrays.negSquare)); |
|
free(hostArrays.input); |
|
free(hostArrays.square); |
|
free(hostArrays.negSquare); |
|
printf("Cleanup complete. Exiting sample.\n"); |
|
}
|
|
|