cuda-samples/Samples/0_Introduction/simpleIPC/simpleIPC.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample demonstrates Inter Process Communication
 * using one process per GPU for computation.
 */
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#include "helper_cuda.h"
#include "helper_multiprocess.h"
static const char shmName[] = "simpleIPCshm";
// For direct NVLINK and PCI-E peers, at max 8 simultaneous peers are allowed
// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
// in the same way.
#define MAX_DEVICES (32)
#define DATA_SIZE   (64ULL << 20ULL) // 64MB

#if defined(__linux__)
#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define cpu_atomic_add32(a, x) InterlockedAdd((volatile LONG *)a, x)
#else
#error Unsupported system
#endif

typedef struct shmStruct_st
{
    size_t               nprocesses;
    int                  barrier;
    int                  sense;
    int                  devices[MAX_DEVICES];
    cudaIpcMemHandle_t   memHandle[MAX_DEVICES];
    cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
} shmStruct;

__global__ void simpleKernel(char *ptr, int sz, char val)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
        ptr[idx] = val;
    }
}

static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
{
    int count;

    // Check-in
    count = cpu_atomic_add32(barrier, 1);
    if (count == n) // Last one in
        *sense = 1;
    while (!*sense)
        ;

    // Check-out
    count = cpu_atomic_add32(barrier, -1);
    if (count == 0) // Last one out
        *sense = 0;
    while (*sense)
        ;
}

static void childProcess(int id)
{
    volatile shmStruct      *shm = NULL;
    cudaStream_t             stream;
    sharedMemoryInfo         info;
    size_t                   procCount, i;
    int                      blocks  = 0;
    int                      threads = 128;
    cudaDeviceProp           prop;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<char>        verification_buffer(DATA_SIZE);
    pid_t                    pid;
    char                     pidString[20] = {0};
    char                     lshmName[40] = {0};

    pid = getppid();
    snprintf(pidString, sizeof(pidString), "%d", pid);
    strcat(lshmName, shmName);
    strcat(lshmName, pidString);

    printf("CP: lshmName = %s\n", lshmName);

    if (sharedMemoryOpen(lshmName, sizeof(shmStruct), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm       = (volatile shmStruct *)info.addr;
    procCount = shm->nprocesses;

    printf("Process %d: Starting on device %d...\n", id, shm->devices[id]);

    checkCudaErrors(cudaSetDevice(shm->devices[id]));
    checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
    blocks *= prop.multiProcessorCount;

    // Open and track all the allocations and events created in the master
    // process for use later
    for (i = 0; i < procCount; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        // Notice, we don't need to explicitly enable peer access for
        // allocations on other devices.
        checkCudaErrors(
            cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
        checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // At each iteration of the loop, each sibling process will push work on
    // their respective devices accessing the next peer mapped buffer allocated
    // by the master process (these can come from other sibling processes as
    // well). To coordinate each process' access, we force the stream to wait for
    // the work already accessing this buffer asynchronously through IPC events,
    // allowing the CPU processes to continue to queue more work.
    for (i = 0; i < procCount; i++) {
        size_t bufferId = (i + id) % procCount;
        // Wait for the buffer to be accessed to be ready
        checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
        // Push a simple kernel on it
        simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
        checkCudaErrors(cudaGetLastError());
        // Signal that this buffer is ready for the next consumer
        checkCudaErrors(cudaEventRecord(events[bufferId], stream));
        // Wait for all my sibling processes to push this stage of their work
        // before proceeding to the next. This prevents siblings from racing
        // ahead and clobbering the recorded event or waiting on the wrong
        // recorded event.
        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
        if (id == 0) {
            printf("Step %lld done\n", (unsigned long long)i);
        }
    }

    // Now wait for my buffer to be ready so I can copy it locally and verify it
    checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
    checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
    // And wait for all the queued up work to complete
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Process %d: verifying...\n", id);

    // The contents should have the id of the sibling just after me
    char compareId = (char)((id + 1) % procCount);
    for (unsigned long long j = 0; j < DATA_SIZE; j++) {
        if (verification_buffer[j] != compareId) {
            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
                   id,
                   j,
                   (int)verification_buffer[j],
                   (int)compareId);
        }
    }

    // Clean up!
    for (i = 0; i < procCount; i++) {
        checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
    }

    checkCudaErrors(cudaStreamDestroy(stream));

    printf("Process %d complete!\n", id);
}

static void parentProcess(char *app)
{
    sharedMemoryInfo         info;
    int                      devCount, i;
    volatile shmStruct      *shm = NULL;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<Process>     processes;
    pid_t                    pid;
    char                     pidString[20] = {0};
    char                     lshmName[40] = {0};

    pid = getpid();
    snprintf(pidString, sizeof(pidString), "%d", pid);
    strcat(lshmName, shmName);
    strcat(lshmName, pidString);

    printf("PP: lshmName = %s\n", lshmName);

    checkCudaErrors(cudaGetDeviceCount(&devCount));

    if (sharedMemoryCreate(lshmName, sizeof(*shm), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm = (volatile shmStruct *)info.addr;
    memset((void *)shm, 0, sizeof(*shm));

    // Pick all the devices that can access each other's memory for this test
    // Keep in mind that CUDA has minimal support for fork() without a
    // corresponding exec() in the child process, but in this case our
    // spawnProcess will always exec, so no need to worry.
    for (i = 0; i < devCount; i++) {
        bool           allPeers = true;
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, i));

        // CUDA IPC is only supported on devices with unified addressing
        if (!prop.unifiedAddressing) {
            printf("Device %d does not support unified addressing, skipping...\n", i);
            continue;
        }
        // This sample requires two processes accessing each device, so we need
        // to ensure exclusive or prohibited mode is not set
        if (prop.computeMode != cudaComputeModeDefault) {
            printf("Device %d is in an unsupported compute mode for this sample\n", i);
            continue;
        }

        for (int j = 0; j < shm->nprocesses; j++) {
            int canAccessPeerIJ, canAccessPeerJI;
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
            if (!canAccessPeerIJ || !canAccessPeerJI) {
                allPeers = false;
                break;
            }
        }
        if (allPeers) {
            // Enable peers here.  This isn't necessary for IPC, but it will
            // setup the peers for the device.  For systems that only allow 8
            // peers per GPU at a time, this acts to remove devices from CanAccessPeer
            for (int j = 0; j < shm->nprocesses; j++) {
                checkCudaErrors(cudaSetDevice(i));
                checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
                checkCudaErrors(cudaSetDevice(shm->devices[j]));
                checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
            }
            shm->devices[shm->nprocesses++] = i;
            if (shm->nprocesses >= MAX_DEVICES)
                break;
        }
        else {
            printf("Device %d is not peer capable with some other selected peers, "
                   "skipping\n",
                   i);
        }
    }

    if (shm->nprocesses == 0) {
        printf("No CUDA devices support IPC\n");
        exit(EXIT_WAIVED);
    }

    // Now allocate memory and an event for each process and fill the shared
    // memory buffer with the IPC handles to communicate
    for (i = 0; i < shm->nprocesses; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // Launch the child processes!
    for (i = 0; i < shm->nprocesses; i++) {
        char        devIdx[12]; // Increased size to ensure enough space for formatted integer
        char *const args[] = {app, devIdx, NULL};
        Process     process;

        snprintf(devIdx, sizeof(devIdx), "%d", i);

        if (spawnProcess(&process, app, args)) {
            printf("Failed to create process\n");
            exit(EXIT_FAILURE);
        }

        processes.push_back(process);
    }

    // And wait for them to finish
    for (i = 0; i < processes.size(); i++) {
        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
            printf("Process %d failed!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Clean up!
    for (i = 0; i < shm->nprocesses; i++) {
        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaEventSynchronize(events[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
        checkCudaErrors(cudaFree(ptrs[i]));
    }

    sharedMemoryClose(&info);
}

int main(int argc, char **argv)
{
#if defined(__arm__) || defined(__aarch64__)
    printf("Not supported on ARM\n");
    return EXIT_WAIVED;
#else
    if (argc == 1) {
        parentProcess(argv[0]);
    }
    else {
        childProcess(atoi(argv[1]));
    }
    return EXIT_SUCCESS;
#endif
}