/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two
 * memcopies with kernel execution. This sample illustrates the usage of CUDA
 * streams to overlap kernel execution with copying data to and from the
 * device.
 *
 * Additionally, this sample uses CUDA events to measure elapsed time for
 * CUDA calls. Events are part of the CUDA API and provide a system-independent
 * way to measure execution times on CUDA devices with approximately 0.5
 * microsecond precision.
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 */

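/*
 * For reference, the event-timing idiom used throughout this file, shown as
 * a minimal sketch (error checking omitted; t0/t1 are placeholder names):
 *
 *   cudaEvent_t t0, t1;
 *   cudaEventCreate(&t0);
 *   cudaEventCreate(&t1);
 *   cudaEventRecord(t0, 0);   // enqueue a marker before the work
 *   // ... async copies and/or kernel launches on stream 0 ...
 *   cudaEventRecord(t1, 0);   // enqueue a marker after the work
 *   cudaEventSynchronize(t1); // block until t1 has actually been reached
 *   float ms;
 *   cudaEventElapsedTime(&ms, t0, t1);
 */
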
const char *sSDKname = "simpleMultiCopy";

// includes, system
#include <stdio.h>

// include CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions shared across the CUDA Samples

// includes, kernels
// The CUDA kernel is declared here; main() below contains the host code
// needed to launch it and generate a compute workload on the system.
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
{
    // Flatten the 2D grid index so that grids split across grid.x and grid.y
    // (see the grid setup in main) enumerate every element exactly once.
    int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    if (idx < N) {
        // The inner_reps repetitions are deliberate busywork: they lengthen
        // the kernel so that its overlap with the copies is measurable.
        for (int i = 0; i < inner_reps; ++i) {
            g_out[idx] = g_in[idx] + 1;
        }
    }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
// #define SIMULATE_IO

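// Note: STREAM_COUNT also sizes the buffer arrays below. processWithStreams()
// rotates through the streams round-robin, so each stream always works on its
// own buffer pair and never races another stream for memory.
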
// Pageable host buffers standing in for an external data source and sink
int *h_data_source;
int *h_data_sink;

// Pinned host staging buffers and device buffers, one pair per stream
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

int N = 1 << 22;
int nreps = 10;     // number of times each experiment is repeated
int inner_reps = 5; // busywork iterations inside the kernel

int memsize;

dim3 block(512);
dim3 grid;

int thread_blocks;

float processWithStreams(int streams_used);
void init();
bool test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    int cuda_device = 0;
    float scale_factor;
    cudaDeviceProp deviceProp;

    printf("[%s] - Starting...\n", sSDKname);

    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

        if (cuda_device < 0) {
            printf("Invalid command line parameters\n");
            exit(EXIT_FAILURE);
        }
        else {
            printf("cuda_device = %d\n", cuda_device);
            cuda_device = gpuDeviceInit(cuda_device);

            if (cuda_device < 0) {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
    else {
        // Otherwise pick the device with the highest Gflops/s
        cuda_device = gpuGetMaxGflopsDeviceId();
        checkCudaErrors(cudaSetDevice(cuda_device));
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
        printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.name,
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    // Devices with fewer than 32 cores get a proportionally scaled-down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    N = (int)((float)N / scale_factor);

    printf("> Device name: %s\n", deviceProp.name);
    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major,
           deviceProp.minor,
           deviceProp.multiProcessorCount);
    printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
    printf("> array_size = %d\n\n", N);

    memsize = N * sizeof(int);

    thread_blocks = N / block.x;

    // Split the block count across grid.x and grid.y so grid.x stays within
    // the 65535-per-dimension limit of legacy devices; the ceiling division
    // guarantees grid.x * grid.y >= thread_blocks even at exact multiples.
    grid.x = (thread_blocks < 65535) ? thread_blocks : 65535;
    grid.y = (thread_blocks + 65534) / 65535;

    // Allocate resources

    h_data_source = (int *)malloc(memsize);
    h_data_sink = (int *)malloc(memsize);

    for (int i = 0; i < STREAM_COUNT; ++i) {
        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
        checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

        checkCudaErrors(cudaStreamCreate(&stream[i]));
        checkCudaErrors(cudaEventCreate(&cycleDone[i]));

        // Record each event once so the first cudaEventSynchronize() on it
        // in processWithStreams() returns immediately.
        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    init();

    // Kernel warmup
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

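    // The untimed warmup launch above absorbs one-time startup costs (context
    // initialization, module loading) so they do not distort the timings below.
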
    // Time copies and kernel
    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    printf("\n");
    printf("Relevant properties of this CUDA device\n");
    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
           "(device property \"deviceOverlap\")\n",
           deviceProp.deviceOverlap ? "X" : " ");
    // printf("(%s) Can execute several GPU kernels simultaneously (compute
    // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
           "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
           "4000/5000/6000/K5000))\n",
           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

    printf("\n");
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

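    // Unit check: bytes * 1e-6 / ms == (bytes / 1e9) / (ms / 1e3) == GB/s.
    // The kernel row counts one read plus one write per element per
    // inner_reps iteration, hence the extra factor of 2.
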
    printf("\n");
    printf("Theoretical limits for speedup gained from overlapped data "
           "transfers:\n");
    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("Compute can overlap with both data transfers: %f ms\n",
           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

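    // Rationale for the three bounds above: fully serialized execution is the
    // sum of all stages; with a single copy engine the two transfers still
    // serialize with each other but can hide behind the kernel (or vice
    // versa); with two copy engines every stage waits only for the slowest
    // single stage.
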
    // Process pipelined work
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\nAverage measured timings over %d repetitions:\n", nreps);
    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

    printf("\nMeasured throughput:\n");
    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

    // Verify the results; we use them for the final exit status
    bool bResults = test();

    // Free resources

    free(h_data_source);
    free(h_data_sink);

    for (int i = 0; i < STREAM_COUNT; ++i) {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Test result
    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

float processWithStreams(int streams_used)
{
    int current_stream = 0;

    float time;

    // Do processing in a loop.
    //
    // Note: all memory commands are processed in the order they are issued,
    // independent of the stream they are enqueued in. Hence the order in
    // which the copy and kernel commands are enqueued across the streams
    // influences the achieved overlap.

    cudaEventRecord(start, 0);

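    // Per-iteration issue order: kernel on the current stream, H2D copy of the
    // next frame on the next stream, D2H copy of the current frame back on the
    // current stream. Interleaving the streams this way lets the copy
    // engine(s) transfer neighboring frames while the kernel runs.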
    for (int i = 0; i < nreps; ++i) {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
        // Store the result
        memcpy(h_data_sink, h_data_out[current_stream], memsize);

        // Read new input
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // Ensure that processing and copying of the last cycle have finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        incKernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

        // Upload next frame
        checkCudaErrors(cudaMemcpyAsync(
            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

        // Download current frame
        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
                                        d_data_out[current_stream],
                                        memsize,
                                        cudaMemcpyDeviceToHost,
                                        stream[current_stream]));

        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

        current_stream = next_stream;
    }

    cudaEventRecord(stop, 0);

    cudaDeviceSynchronize();

    cudaEventElapsedTime(&time, start, stop);

    return time;
}
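
// With streams_used == 1, next_stream always equals current_stream, so every
// command above lands in the same stream and serializes; with STREAM_COUNT
// streams, the upload and download can overlap the kernel on devices whose
// asyncEngineCount permits it.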

void init()
{
    for (int i = 0; i < N; ++i) {
        h_data_source[i] = 0;
    }

    for (int i = 0; i < STREAM_COUNT; ++i) {
        memcpy(h_data_in[i], h_data_source, memsize);
    }
}

bool test()
{
    bool passed = true;

    for (int j = 0; j < STREAM_COUNT; ++j) {
        for (int i = 0; i < N; ++i) {
            // Every input element starts at 0, so each output element must be 1.
            passed &= (h_data_out[j][i] == 1);
        }
    }

    return passed;
}