/* cuda-samples/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h */

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */


#ifndef __cuda_drvapi_dynlink_cuda_h__
#define __cuda_drvapi_dynlink_cuda_h__

#include <stdlib.h>


/* NOTE(review): presumably mirrors the include guard of the toolkit's own
 * cuda.h so that header becomes a no-op if it is included after this one -
 * confirm against the toolkit header. */
#define __cuda_cuda_h__ 1

/**
 * CUDA API versioning support
 *
 * Selects which version-dependent declarations in this header are compiled
 * (see the `#if __CUDA_API_VERSION ...` blocks below).
 */
#define __CUDA_API_VERSION 5000

/**
 * \defgroup CUDA_DRIVER CUDA Driver API
 *
 * This section describes the low-level CUDA driver application programming
 * interface.
 *
 * @{
 */

/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * @{
 */

/**
 * CUDA API version number
 *
 * NOTE(review): this is 3020 (3.2) while __CUDA_API_VERSION in this header is
 * 5000; the mismatch looks historical - confirm before relying on either.
 */
#define CUDA_VERSION 3020 /* 3.2 */

#ifdef __cplusplus
extern "C"
{
#endif

/**
 * CUDA device pointer.
 *
 * Declared only when __CUDA_API_VERSION is 3020 or later. The underlying type
 * is `unsigned long long` when either _WIN64 or __LP64__ is defined, and
 * `unsigned int` otherwise.
 */
#if __CUDA_API_VERSION >= 3020
#if !defined(_WIN64) && !defined(__LP64__)
    typedef unsigned int CUdeviceptr;
#else
    typedef unsigned long long CUdeviceptr;
#endif
#endif /* __CUDA_API_VERSION >= 3020 */

    /** CUDA device */
    typedef int CUdevice;

    /*
     * Handle types below are pointers to struct types that are named here but
     * not defined in this section of the header.
     */

    /** CUDA context */
    typedef struct CUctx_st *CUcontext;
    /** CUDA module */
    typedef struct CUmod_st *CUmodule;
    /** CUDA function */
    typedef struct CUfunc_st *CUfunction;
    /** CUDA array */
    typedef struct CUarray_st *CUarray;
    /** CUDA mipmapped array */
    typedef struct CUmipmappedArray_st *CUmipmappedArray;
    /** CUDA texture reference */
    typedef struct CUtexref_st *CUtexref;
    /** CUDA surface reference */
    typedef struct CUsurfref_st *CUsurfref;
    /** CUDA event */
    typedef struct CUevent_st *CUevent;
    /** CUDA stream */
    typedef struct CUstream_st *CUstream;
    /** CUDA graphics interop resource */
    typedef struct CUgraphicsResource_st *CUgraphicsResource;

    /* Object types are integer-valued rather than pointers. */

    /** CUDA texture object */
    typedef unsigned long long CUtexObject;
    /** CUDA surface object */
    typedef unsigned long long CUsurfObject;

    /**
     * CUDA definition of UUID: a single 16-element char array.
     */
    typedef struct CUuuid_st
    {
        char bytes[16];
    } CUuuid;

    /**
     * Context creation flags
     *
     * The trailing mask enumerators (and CU_CTX_PRIMARY) depend on
     * __CUDA_API_VERSION; everything before them is unconditional.
     */
    typedef enum CUctx_flags_enum {
        /** Automatic scheduling */
        CU_CTX_SCHED_AUTO = 0,
        /** Set spin as default scheduling */
        CU_CTX_SCHED_SPIN = 1 << 0,
        /** Set yield as default scheduling */
        CU_CTX_SCHED_YIELD = 1 << 1,
        /** Set blocking synchronization as default scheduling */
        CU_CTX_SCHED_BLOCKING_SYNC = 1 << 2,
        /** Set blocking synchronization as default scheduling \deprecated */
        CU_CTX_BLOCKING_SYNC = CU_CTX_SCHED_BLOCKING_SYNC,
        /** Support mapped pinned allocations */
        CU_CTX_MAP_HOST = 1 << 3,
        /** Keep local memory allocation after launch */
        CU_CTX_LMEM_RESIZE_TO_MAX = 1 << 4,
#if __CUDA_API_VERSION >= 4000
        CU_CTX_SCHED_MASK = 0x07,
        /** Initialize and return the primary context */
        CU_CTX_PRIMARY = 1 << 5,
        CU_CTX_FLAGS_MASK = 0x3f
#else
        CU_CTX_SCHED_MASK = 0x03,
        CU_CTX_FLAGS_MASK = 0x1f
#endif
    } CUctx_flags;

    /**
     * Event creation flags
     */
    typedef enum CUevent_flags_enum {
        /** Default event flag */
        CU_EVENT_DEFAULT = 0x0,
        /** Event uses blocking synchronization */
        CU_EVENT_BLOCKING_SYNC = 0x1,
        /** Event will not record timing data */
        CU_EVENT_DISABLE_TIMING = 0x2
    } CUevent_flags;

    /**
     * Array formats
     */
    typedef enum CUarray_format_enum {
        /** Unsigned 8-bit integers */
        CU_AD_FORMAT_UNSIGNED_INT8 = 1,
        /** Unsigned 16-bit integers */
        CU_AD_FORMAT_UNSIGNED_INT16 = 2,
        /** Unsigned 32-bit integers */
        CU_AD_FORMAT_UNSIGNED_INT32 = 3,
        /** Signed 8-bit integers */
        CU_AD_FORMAT_SIGNED_INT8 = 8,
        /** Signed 16-bit integers */
        CU_AD_FORMAT_SIGNED_INT16 = 9,
        /** Signed 32-bit integers */
        CU_AD_FORMAT_SIGNED_INT32 = 10,
        /** 16-bit floating point */
        CU_AD_FORMAT_HALF = 16,
        /** 32-bit floating point */
        CU_AD_FORMAT_FLOAT = 32
    } CUarray_format;

    /**
     * Texture reference addressing modes
     */
    typedef enum CUaddress_mode_enum {
        /** Wrapping address mode */
        CU_TR_ADDRESS_MODE_WRAP = 0x0,
        /** Clamp to edge address mode */
        CU_TR_ADDRESS_MODE_CLAMP = 0x1,
        /** Mirror address mode */
        CU_TR_ADDRESS_MODE_MIRROR = 0x2,
        /** Border address mode */
        CU_TR_ADDRESS_MODE_BORDER = 0x3
    } CUaddress_mode;

    /**
     * Texture reference filtering modes
     */
    typedef enum CUfilter_mode_enum {
        /** Point filter mode */
        CU_TR_FILTER_MODE_POINT = 0x0,
        /** Linear filter mode */
        CU_TR_FILTER_MODE_LINEAR = 0x1
    } CUfilter_mode;

    /**
     * Device properties
     *
     * Every enumerator carries an explicit value, so declaration order has no
     * effect on numbering; entries are listed in ascending numeric order.
     * Attributes 36-43 are declared only when __CUDA_API_VERSION >= 4000.
     */
    typedef enum CUdevice_attribute_enum {
        /** Maximum number of threads per block */
        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
        /** Maximum block dimension X */
        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
        /** Maximum block dimension Y */
        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
        /** Maximum block dimension Z */
        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
        /** Maximum grid dimension X */
        CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
        /** Maximum grid dimension Y */
        CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
        /** Maximum grid dimension Z */
        CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
        /** Maximum shared memory available per block in bytes */
        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
        /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
        /** Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
        CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
        /** Warp size in threads */
        CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
        /** Maximum pitch in bytes allowed by memory copies */
        CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
        /** Maximum number of 32-bit registers available per block */
        CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
        /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
        CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
        /** Peak clock frequency in kilohertz */
        CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
        /** Alignment requirement for textures */
        CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
        /** Device can possibly copy memory and execute a kernel concurrently */
        CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
        /** Number of multiprocessors on device */
        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
        /** Specifies whether there is a run time limit on kernels */
        CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
        /** Device is integrated with host memory */
        CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
        /** Device can map host memory into CUDA address space */
        CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
        /** Compute mode (See ::CUcomputemode for details) */
        CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
        /** Maximum 1D texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
        /** Maximum 2D texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
        /** Maximum 2D texture height */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
        /** Maximum 3D texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
        /** Maximum 3D texture height */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
        /** Maximum 3D texture depth */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
        /** Maximum texture array width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
        /** Maximum texture array height */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
        /** Maximum slices in a texture array */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
        /** Alignment requirement for surfaces */
        CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
        /** Device can possibly execute multiple kernels concurrently */
        CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
        /** Device has ECC support enabled */
        CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
        /** PCI bus ID of the device */
        CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
        /** PCI device ID of the device */
        CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
        /** Device is using TCC driver model */
        CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
#if __CUDA_API_VERSION >= 4000
        /** Peak memory clock frequency in kilohertz */
        CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
        /** Global memory bus width in bits */
        CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
        /** Size of L2 cache in bytes */
        CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
        /** Maximum resident threads per multiprocessor */
        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
        /** Number of asynchronous engines */
        CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
        /** Device shares a unified address space with the host */
        CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
        /** Maximum 1D layered texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
        /** Maximum layers in a 1D layered texture */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
#endif
        /** Major compute capability version number */
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
        /** Minor compute capability version number */
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
    } CUdevice_attribute;

    /**
     * Legacy device properties
     *
     * Plain aggregate of `int` fields; member order and types define the
     * layout and must not be rearranged.
     */
    typedef struct CUdevprop_st
    {
        /** Maximum number of threads per block */
        int maxThreadsPerBlock;
        /** Maximum size of each dimension of a block */
        int maxThreadsDim[3];
        /** Maximum size of each dimension of a grid */
        int maxGridSize[3];
        /** Shared memory available per block in bytes */
        int sharedMemPerBlock;
        /** Constant memory available on device in bytes */
        int totalConstantMemory;
        /** Warp size in threads */
        int SIMDWidth;
        /** Maximum pitch in bytes allowed by memory copies */
        int memPitch;
        /** 32-bit registers available per block */
        int regsPerBlock;
        /** Clock frequency in kilohertz */
        int clockRate;
        /** Alignment requirement for textures */
        int textureAlign;
    } CUdevprop;

    /**
     * Function properties
     *
     * Attribute selectors numbered 0 through 6, followed by the
     * CU_FUNC_ATTRIBUTE_MAX sentinel.
     */
    typedef enum CUfunction_attribute_enum {
        /**
         * The maximum number of threads per block, beyond which a launch of the
         * function would fail. This number depends on both the function and the
         * device on which the function is currently loaded.
         */
        CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

        /**
         * The size in bytes of statically-allocated shared memory required by
         * this function. This does not include dynamically-allocated shared
         * memory requested by the user at runtime.
         */
        CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

        /**
         * The size in bytes of user-allocated constant memory required by this
         * function.
         */
        CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

        /**
         * The size in bytes of local memory used by each thread of this function.
         */
        CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

        /**
         * The number of registers used by each thread of this function.
         */
        CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

        /**
         * The PTX virtual architecture version for which the function was
         * compiled. This value is the major PTX version * 10 + the minor PTX
         * version, so a PTX version 1.3 function would return the value 13.
         * Note that this may return the undefined value of 0 for cubins
         * compiled prior to CUDA 3.0.
         */
        CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,

        /**
         * The binary architecture version for which the function was compiled.
         * This value is the major binary version * 10 + the minor binary version,
         * so a binary version 1.3 function would return the value 13. Note that
         * this will return a value of 10 for legacy cubins that do not have a
         * properly-encoded binary architecture version.
         */
        CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,

        /**
         * Sentinel with no explicit value; it takes the next value after
         * CU_FUNC_ATTRIBUTE_BINARY_VERSION (7), i.e. the number of attributes
         * declared above.
         */
        CU_FUNC_ATTRIBUTE_MAX
    } CUfunction_attribute;

    /**
     * Function cache configurations
     */
    typedef enum CUfunc_cache_enum {
        /** no preference for shared memory or L1 (default) */
        CU_FUNC_CACHE_PREFER_NONE = 0,
        /** prefer larger shared memory and smaller L1 cache */
        CU_FUNC_CACHE_PREFER_SHARED = 1,
        /** prefer larger L1 cache and smaller shared memory */
        CU_FUNC_CACHE_PREFER_L1 = 2
    } CUfunc_cache;

    /**
     * Shared memory configurations
     */
    typedef enum CUsharedconfig_enum {
        /** set default shared memory bank size */
        CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0,
        /** set shared memory bank width to four bytes */
        CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 1,
        /** set shared memory bank width to eight bytes */
        CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 2
    } CUsharedconfig;

    /**
     * Memory types
     *
     * CU_MEMORYTYPE_UNIFIED is declared only when __CUDA_API_VERSION >= 4000.
     * All values are explicit, so the conditional entry can sit before the
     * final enumerator without affecting numbering.
     */
    typedef enum CUmemorytype_enum {
        /** Host memory */
        CU_MEMORYTYPE_HOST = 1,
        /** Device memory */
        CU_MEMORYTYPE_DEVICE = 2,
#if __CUDA_API_VERSION >= 4000
        /** Unified device or host memory */
        CU_MEMORYTYPE_UNIFIED = 4,
#endif
        /** Array memory */
        CU_MEMORYTYPE_ARRAY = 3
    } CUmemorytype;

    /**
     * Compute Modes
     *
     * CU_COMPUTEMODE_EXCLUSIVE_PROCESS is declared only when
     * __CUDA_API_VERSION >= 4000. Value 1 is not assigned in this enum.
     */
    typedef enum CUcomputemode_enum {
        /** Default compute mode (Multiple contexts allowed per device) */
        CU_COMPUTEMODE_DEFAULT = 0x0,
#if __CUDA_API_VERSION >= 4000
        /**
         * Compute-exclusive-process mode (Only one context used by a single
         * process can be present on this device at a time)
         */
        CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 0x3,
#endif
        /** Compute-prohibited mode (No contexts can be created on this device at this time) */
        CU_COMPUTEMODE_PROHIBITED = 0x2
    } CUcomputemode;

    /**
     * Online compiler options
     *
     * Only the first enumerator has an explicit value (0); the rest take
     * consecutive implicit values, ending with CU_JIT_FALLBACK_STRATEGY = 10.
     * Do not reorder or insert entries without accounting for that.
     */
    typedef enum CUjit_option_enum {
        /**
         * Max number of registers that a thread may use.\n
         * Option type: unsigned int
         */
        CU_JIT_MAX_REGISTERS = 0,

        /**
         * IN: Specifies minimum number of threads per block to target compilation
         * for\n
         * OUT: Returns the number of threads the compiler actually targeted.
         * This restricts the resource utilization of the compiler (e.g. max
         * registers) such that a block with the given number of threads should be
         * able to launch based on register limitations. Note, this option does not
         * currently take into account any other resource limitations, such as
         * shared memory utilization.\n
         * Option type: unsigned int
         */
        CU_JIT_THREADS_PER_BLOCK,

        /**
         * Returns a float value in the option of the wall clock time, in
         * milliseconds, spent creating the cubin\n
         * Option type: float
         */
        CU_JIT_WALL_TIME,

        /**
         * Pointer to a buffer in which to print any log messages from PTXAS
         * that are informational in nature (the buffer size is specified via
         * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
         * Option type: char*
         */
        CU_JIT_INFO_LOG_BUFFER,

        /**
         * IN: Log buffer size in bytes.  Log messages will be capped at this size
         * (including null terminator)\n
         * OUT: Amount of log buffer filled with messages\n
         * Option type: unsigned int
         */
        CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,

        /**
         * Pointer to a buffer in which to print any log messages from PTXAS that
         * reflect errors (the buffer size is specified via option
         * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
         * Option type: char*
         */
        CU_JIT_ERROR_LOG_BUFFER,

        /**
         * IN: Log buffer size in bytes.  Log messages will be capped at this size
         * (including null terminator)\n
         * OUT: Amount of log buffer filled with messages\n
         * Option type: unsigned int
         */
        CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,

        /**
         * Level of optimizations to apply to generated code (0 - 4), with 4
         * being the default and highest level of optimizations.\n
         * Option type: unsigned int
         */
        CU_JIT_OPTIMIZATION_LEVEL,

        /**
         * No option value required. Determines the target based on the current
         * attached context (default)\n
         * Option type: No option value needed
         */
        CU_JIT_TARGET_FROM_CUCONTEXT,

        /**
         * Target is chosen based on supplied ::CUjit_target_enum.\n
         * Option type: unsigned int for enumerated type ::CUjit_target_enum
         */
        CU_JIT_TARGET,

        /**
         * Specifies choice of fallback strategy if matching cubin is not found.
         * Choice is based on supplied ::CUjit_fallback_enum.\n
         * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
         */
        CU_JIT_FALLBACK_STRATEGY

    } CUjit_option;

    /**
     * Online compilation targets
     *
     * Each value is the compute device class written as major * 10 + minor
     * (for example, class 3.5 is 35).
     */
    typedef enum CUjit_target_enum {
        /** Compute device class 2.0 */
        CU_TARGET_COMPUTE_20 = 20,
        /** Compute device class 2.1 */
        CU_TARGET_COMPUTE_21 = 21,
        /** Compute device class 3.0 */
        CU_TARGET_COMPUTE_30 = 30,
        /** Compute device class 3.2 */
        CU_TARGET_COMPUTE_32 = 32,
        /** Compute device class 3.5 */
        CU_TARGET_COMPUTE_35 = 35,
        /** Compute device class 3.7 */
        CU_TARGET_COMPUTE_37 = 37,
        /** Compute device class 5.0 */
        CU_TARGET_COMPUTE_50 = 50,
        /** Compute device class 5.2 */
        CU_TARGET_COMPUTE_52 = 52,
        /** Compute device class 5.3 */
        CU_TARGET_COMPUTE_53 = 53,
        /** Compute device class 6.0 */
        CU_TARGET_COMPUTE_60 = 60,
        /** Compute device class 6.1 */
        CU_TARGET_COMPUTE_61 = 61,
        /** Compute device class 6.2 */
        CU_TARGET_COMPUTE_62 = 62,
        /** Compute device class 7.0 */
        CU_TARGET_COMPUTE_70 = 70
    } CUjit_target;

    /**
     * Cubin matching fallback strategies
     */
    typedef enum CUjit_fallback_enum {
        /** Prefer to compile ptx */
        CU_PREFER_PTX = 0,
        /** Prefer to fall back to compatible binary code */
        CU_PREFER_BINARY = 1
    } CUjit_fallback;

    /**
     * Flags to register a graphics resource
     *
     * Each non-zero flag occupies its own bit.
     */
    typedef enum CUgraphicsRegisterFlags_enum {
        CU_GRAPHICS_REGISTER_FLAGS_NONE = 0,
        CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 1 << 0,
        CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 1 << 1,
        CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 1 << 2
    } CUgraphicsRegisterFlags;

    /**
     * Flags for mapping and unmapping interop resources
     *
     * Each non-zero flag occupies its own bit.
     */
    typedef enum CUgraphicsMapResourceFlags_enum {
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 1 << 0,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 1 << 1
    } CUgraphicsMapResourceFlags;

    /**
     * Array indices for cube faces
     */
    typedef enum CUarray_cubemap_face_enum {
        /** Positive X face of cubemap */
        CU_CUBEMAP_FACE_POSITIVE_X = 0,
        /** Negative X face of cubemap */
        CU_CUBEMAP_FACE_NEGATIVE_X = 1,
        /** Positive Y face of cubemap */
        CU_CUBEMAP_FACE_POSITIVE_Y = 2,
        /** Negative Y face of cubemap */
        CU_CUBEMAP_FACE_NEGATIVE_Y = 3,
        /** Positive Z face of cubemap */
        CU_CUBEMAP_FACE_POSITIVE_Z = 4,
        /** Negative Z face of cubemap */
        CU_CUBEMAP_FACE_NEGATIVE_Z = 5
    } CUarray_cubemap_face;

    /**
     * Limits
     */
    typedef enum CUlimit_enum {
        /** GPU thread stack size */
        CU_LIMIT_STACK_SIZE = 0,
        /** GPU printf FIFO size */
        CU_LIMIT_PRINTF_FIFO_SIZE = 1,
        /** GPU malloc heap size */
        CU_LIMIT_MALLOC_HEAP_SIZE = 2
    } CUlimit;

    /**
     * Resource types
     */
    typedef enum CUresourcetype_enum {
        /** Array resource */
        CU_RESOURCE_TYPE_ARRAY = 0,
        /** Mipmapped array resource */
        CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 1,
        /** Linear resource */
        CU_RESOURCE_TYPE_LINEAR = 2,
        /** Pitch 2D resource */
        CU_RESOURCE_TYPE_PITCH2D = 3
    } CUresourcetype;

    /**
     * Error codes
     *
     * Note that the enum tag is cudaError_enum while the typedef name is
     * CUresult. Every enumerator has an explicit value; values are grouped by
     * hundreds (0-8, 100s, 200s, 300s, 400, 500, 600, 700s, 999).
     */
    typedef enum cudaError_enum {
        /**
         * The API call returned with no errors. In the case of query calls, this
         * can also mean that the operation being queried is complete (see
         * ::cuEventQuery() and ::cuStreamQuery()).
         */
        CUDA_SUCCESS = 0,

        /**
         * This indicates that one or more of the parameters passed to the API call
         * is not within an acceptable range of values.
         */
        CUDA_ERROR_INVALID_VALUE = 1,

        /**
         * The API call failed because it was unable to allocate enough memory to
         * perform the requested operation.
         */
        CUDA_ERROR_OUT_OF_MEMORY = 2,

        /**
         * This indicates that the CUDA driver has not been initialized with
         * ::cuInit() or that initialization has failed.
         */
        CUDA_ERROR_NOT_INITIALIZED = 3,

        /**
         * This indicates that the CUDA driver is in the process of shutting down.
         */
        CUDA_ERROR_DEINITIALIZED = 4,

        /**
         * This indicates profiling APIs are called while application is running
         * in visual profiler mode.
         */
        CUDA_ERROR_PROFILER_DISABLED = 5,
        /**
         * This indicates profiling has not been initialized for this context.
         * Call cuProfilerInitialize() to resolve this.
         */
        CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6,
        /**
         * This indicates profiler has already been started and probably
         * cuProfilerStart() is incorrectly called.
         */
        CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,
        /**
         * This indicates profiler has already been stopped and probably
         * cuProfilerStop() is incorrectly called.
         */
        CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,
        /**
         * This indicates that no CUDA-capable devices were detected by the installed
         * CUDA driver.
         */
        CUDA_ERROR_NO_DEVICE = 100,

        /**
         * This indicates that the device ordinal supplied by the user does not
         * correspond to a valid CUDA device.
         */
        CUDA_ERROR_INVALID_DEVICE = 101,


        /**
         * This indicates that the device kernel image is invalid. This can also
         * indicate an invalid CUDA module.
         */
        CUDA_ERROR_INVALID_IMAGE = 200,

        /**
         * This most frequently indicates that there is no context bound to the
         * current thread. This can also be returned if the context passed to an
         * API call is not a valid handle (such as a context that has had
         * ::cuCtxDestroy() invoked on it). This can also be returned if a user
         * mixes different API versions (i.e. 3010 context with 3020 API calls).
         * See ::cuCtxGetApiVersion() for more details.
         */
        CUDA_ERROR_INVALID_CONTEXT = 201,

        /**
         * This indicates that the context being supplied as a parameter to the
         * API call was already the active context.
         * \deprecated
         * This error return is deprecated as of CUDA 3.2. It is no longer an
         * error to attempt to push the active context via ::cuCtxPushCurrent().
         */
        CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,

        /**
         * This indicates that a map or register operation has failed.
         */
        CUDA_ERROR_MAP_FAILED = 205,

        /**
         * This indicates that an unmap or unregister operation has failed.
         */
        CUDA_ERROR_UNMAP_FAILED = 206,

        /**
         * This indicates that the specified array is currently mapped and thus
         * cannot be destroyed.
         */
        CUDA_ERROR_ARRAY_IS_MAPPED = 207,

        /**
         * This indicates that the resource is already mapped.
         */
        CUDA_ERROR_ALREADY_MAPPED = 208,

        /**
         * This indicates that there is no kernel image available that is suitable
         * for the device. This can occur when a user specifies code generation
         * options for a particular CUDA source file that do not include the
         * corresponding device configuration.
         */
        CUDA_ERROR_NO_BINARY_FOR_GPU = 209,

        /**
         * This indicates that a resource has already been acquired.
         */
        CUDA_ERROR_ALREADY_ACQUIRED = 210,

        /**
         * This indicates that a resource is not mapped.
         */
        CUDA_ERROR_NOT_MAPPED = 211,

        /**
         * This indicates that a mapped resource is not available for access as an
         * array.
         */
        CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,

        /**
         * This indicates that a mapped resource is not available for access as a
         * pointer.
         */
        CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,

        /**
         * This indicates that an uncorrectable ECC error was detected during
         * execution.
         */
        CUDA_ERROR_ECC_UNCORRECTABLE = 214,

        /**
         * This indicates that the ::CUlimit passed to the API call is not
         * supported by the active device.
         */
        CUDA_ERROR_UNSUPPORTED_LIMIT = 215,

        /**
         * This indicates that the ::CUcontext passed to the API call can
         * only be bound to a single CPU thread at a time but is already
         * bound to a CPU thread.
         */
        CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216,

        /**
         * This indicates that the device kernel source is invalid.
         */
        CUDA_ERROR_INVALID_SOURCE = 300,

        /**
         * This indicates that the file specified was not found.
         */
        CUDA_ERROR_FILE_NOT_FOUND = 301,

        /**
         * This indicates that a link to a shared object failed to resolve.
         */
        CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

        /**
         * This indicates that initialization of a shared object failed.
         */
        CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,

        /**
         * This indicates that an OS call failed.
         */
        CUDA_ERROR_OPERATING_SYSTEM = 304,


        /**
         * This indicates that a resource handle passed to the API call was not
         * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
         */
        CUDA_ERROR_INVALID_HANDLE = 400,


        /**
         * This indicates that a named symbol was not found. Examples of symbols
         * are global/constant variable names, texture names, and surface names.
         */
        CUDA_ERROR_NOT_FOUND = 500,


        /**
         * This indicates that asynchronous operations issued previously have not
         * completed yet. This result is not actually an error, but must be indicated
         * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
         * may return this value include ::cuEventQuery() and ::cuStreamQuery().
         */
        CUDA_ERROR_NOT_READY = 600,


        /**
         * An exception occurred on the device while executing a kernel. Common
         * causes include dereferencing an invalid device pointer and accessing
         * out of bounds shared memory. The context cannot be used, so it must
         * be destroyed (and a new one should be created). All existing device
         * memory allocations from this context are invalid and must be
         * reconstructed if the program is to continue using CUDA.
         */
        CUDA_ERROR_LAUNCH_FAILED = 700,

        /**
         * This indicates that a launch did not occur because it did not have
         * appropriate resources. This error usually indicates that the user has
         * attempted to pass too many arguments to the device kernel, or the
         * kernel launch specifies too many threads for the kernel's register
         * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
         * when a 32-bit int is expected) is equivalent to passing too many
         * arguments and can also result in this error.
         */
        CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,

        /**
         * This indicates that the device kernel took too long to execute. This can
         * only occur if timeouts are enabled - see the device attribute
         * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
         * context cannot be used (and must be destroyed similar to
         * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
         * this context are invalid and must be reconstructed if the program is to
         * continue using CUDA.
         */
        CUDA_ERROR_LAUNCH_TIMEOUT = 702,

        /**
         * This error indicates a kernel launch that uses an incompatible texturing
         * mode.
         */
        CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,

        /**
         * This error indicates that a call to ::cuCtxEnablePeerAccess() is
         * trying to re-enable peer access to a context which has already
         * had peer access to it enabled.
         */
        CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,

        /**
         * This error indicates that a call to ::cuMemPeerRegister is trying to
         * register memory from a context which has not had peer access
         * enabled yet via ::cuCtxEnablePeerAccess(), or that
         * ::cuCtxDisablePeerAccess() is trying to disable peer access
         * which has not been enabled yet.
         */
        CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705,

        /**
         * This error indicates that a call to ::cuMemPeerRegister is trying to
         * register already-registered memory.
         */
        CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,

        /**
         * This error indicates that a call to ::cuMemPeerUnregister is trying to
         * unregister memory that has not been registered.
         */
        CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED = 707,

        /**
         * This error indicates that ::cuCtxCreate was called with the flag
         * ::CU_CTX_PRIMARY on a device which already has initialized its
         * primary context.
         */
        CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708,

        /**
         * This error indicates that the context current to the calling thread
         * has been destroyed using ::cuCtxDestroy, or is a primary context which
         * has not yet been initialized.
         */
        CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,

        /**
         * A device-side assert triggered during kernel execution. The context
         * cannot be used anymore, and must be destroyed. All existing device
         * memory allocations from this context are invalid and must be
         * reconstructed if the program is to continue using CUDA.
         */
        CUDA_ERROR_ASSERT = 710,

        /**
         * This error indicates that the hardware resources required to enable
         * peer access have been exhausted for one or more of the devices
         * passed to ::cuCtxEnablePeerAccess().
         */
        CUDA_ERROR_TOO_MANY_PEERS = 711,

        /**
         * This error indicates that the memory range passed to ::cuMemHostRegister()
         * has already been registered.
         */
        CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,

        /**
         * This error indicates that the pointer passed to ::cuMemHostUnregister()
         * does not correspond to any currently registered memory region.
         */
        CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713,

        /**
         * This indicates that an unknown internal error has occurred.
         */
        CUDA_ERROR_UNKNOWN = 999
    } CUresult;

/*
 * CUDA_CB: calling-convention qualifier used by the ::CUstreamCallback
 * function-pointer typedef. It expands to __stdcall when any of the Windows
 * platform macros tested below is defined and to nothing otherwise.
 */
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define CUDA_CB __stdcall
#else
#define CUDA_CB
#endif

    /**
     * CUDA stream callback
     *
     * Pointer-to-function type; the pointed-to function returns nothing and is
     * declared with the ::CUDA_CB calling convention.
     *
     * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback.  May be NULL.
     * \param status ::CUDA_SUCCESS or any persistent error on the stream.
     * \param userData User parameter provided at registration.
     */
    typedef void(CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);

#if __CUDA_API_VERSION >= 4000
/*
 * Host/peer memory flags. These macros are only defined when
 * __CUDA_API_VERSION >= 4000. The PORTABLE (0x01) and DEVICEMAP (0x02) bits
 * have the same numeric value in the cuMemHostAlloc, cuMemHostRegister and
 * cuMemPeerRegister flag families declared here.
 */

/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_PORTABLE 0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_DEVICEMAP 0x02

/**
 * If set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04

/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostRegister()
 */
#define CU_MEMHOSTREGISTER_PORTABLE 0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostRegister()
 */
#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02

/**
 * If set, peer memory is mapped into CUDA address space and
 * ::cuMemPeerGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemPeerRegister()
 */
#define CU_MEMPEERREGISTER_DEVICEMAP 0x02
#endif /* __CUDA_API_VERSION >= 4000 */

#if __CUDA_API_VERSION >= 3020

    /**
     * 2D memory copy parameters
     *
     * This is the layout compiled when __CUDA_API_VERSION >= 3020: every
     * offset, pitch and extent is a size_t. A variant of the same struct with
     * unsigned int fields is declared later in this header for the legacy
     * (pre-3020 / internal) configuration.
     *
     * Source and destination are each described by a memory-type selector
     * followed by one member per memory kind (host pointer, device pointer,
     * array handle) and a pitch.
     */
    typedef struct CUDA_MEMCPY2D_st
    {
        size_t srcXInBytes; /**< Source X in bytes */
        size_t srcY;        /**< Source Y */

        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        size_t       srcPitch;      /**< Source pitch (ignored when src is array) */

        size_t dstXInBytes; /**< Destination X in bytes */
        size_t dstY;        /**< Destination Y */

        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        size_t       dstPitch;      /**< Destination pitch (ignored when dst is array) */

        size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
        size_t Height;       /**< Height of 2D memory copy */
    } CUDA_MEMCPY2D;

    /**
     * 3D memory copy parameters
     *
     * Layout compiled when __CUDA_API_VERSION >= 3020 (size_t offsets, pitches
     * and extents); a legacy unsigned int variant of the same struct is
     * declared later in this header.
     *
     * The reserved0 / reserved1 pointers sit at the same position, after the
     * array handle, that CUDA_MEMCPY3D_PEER uses for its srcContext /
     * dstContext members.
     */
    typedef struct CUDA_MEMCPY3D_st
    {
        size_t       srcXInBytes;   /**< Source X in bytes */
        size_t       srcY;          /**< Source Y */
        size_t       srcZ;          /**< Source Z */
        size_t       srcLOD;        /**< Source LOD */
        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        void        *reserved0;     /**< Must be NULL */
        size_t       srcPitch;      /**< Source pitch (ignored when src is array) */
        size_t       srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */

        size_t       dstXInBytes;   /**< Destination X in bytes */
        size_t       dstY;          /**< Destination Y */
        size_t       dstZ;          /**< Destination Z */
        size_t       dstLOD;        /**< Destination LOD */
        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        void        *reserved1;     /**< Must be NULL */
        size_t       dstPitch;      /**< Destination pitch (ignored when dst is array) */
        size_t       dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

        size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
        size_t Height;       /**< Height of 3D memory copy */
        size_t Depth;        /**< Depth of 3D memory copy */
    } CUDA_MEMCPY3D;

    /**
     * 3D memory cross-context copy parameters
     *
     * Same member sequence as CUDA_MEMCPY3D, except that the two reserved
     * pointer slots are replaced by the source and destination contexts.
     */
    typedef struct CUDA_MEMCPY3D_PEER_st
    {
        size_t       srcXInBytes;   /**< Source X in bytes */
        size_t       srcY;          /**< Source Y */
        size_t       srcZ;          /**< Source Z */
        size_t       srcLOD;        /**< Source LOD */
        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        CUcontext    srcContext;    /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
        size_t       srcPitch;      /**< Source pitch (ignored when src is array) */
        size_t       srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */

        size_t       dstXInBytes;   /**< Destination X in bytes */
        size_t       dstY;          /**< Destination Y */
        size_t       dstZ;          /**< Destination Z */
        size_t       dstLOD;        /**< Destination LOD */
        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        CUcontext    dstContext;    /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
        size_t       dstPitch;      /**< Destination pitch (ignored when dst is array) */
        size_t       dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

        size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
        size_t Height;       /**< Height of 3D memory copy */
        size_t Depth;        /**< Depth of 3D memory copy */
    } CUDA_MEMCPY3D_PEER;

    /**
     * Array descriptor
     *
     * Layout compiled when __CUDA_API_VERSION >= 3020 (size_t extents); a
     * legacy variant with unsigned int extents is declared later in this
     * header.
     */
    typedef struct CUDA_ARRAY_DESCRIPTOR_st
    {
        size_t Width;  /**< Width of array */
        size_t Height; /**< Height of array */

        CUarray_format Format;      /**< Array format */
        unsigned int   NumChannels; /**< Channels per array element */
    } CUDA_ARRAY_DESCRIPTOR;

    /**
     * 3D array descriptor
     *
     * Layout compiled when __CUDA_API_VERSION >= 3020 (size_t extents); a
     * legacy variant with unsigned int extents is declared later in this
     * header. The CUDA_ARRAY3D_* macros defined in this header are the values
     * documented for the Flags member (e.g. CUDA_ARRAY3D_LAYERED changes the
     * meaning of Depth to a layer count).
     */
    typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
    {
        size_t Width;  /**< Width of 3D array */
        size_t Height; /**< Height of 3D array */
        size_t Depth;  /**< Depth of 3D array */

        CUarray_format Format;      /**< Array format */
        unsigned int   NumChannels; /**< Channels per array element */
        unsigned int   Flags;       /**< Flags */
    } CUDA_ARRAY3D_DESCRIPTOR;

#endif /* __CUDA_API_VERSION >= 3020 */

#if __CUDA_API_VERSION >= 5000
    /**
     * CUDA Resource descriptor
     *
     * A resource-type tag followed by a union with one anonymous struct per
     * kind of resource (array, mipmapped array, linear memory, pitched 2D
     * memory) and a trailing flags word.
     *
     * NOTE(review): presumably resType selects which union member is valid;
     * this header does not state the mapping - confirm against the driver
     * API documentation.
     */
    typedef struct CUDA_RESOURCE_DESC_st
    {
        CUresourcetype resType; /**< Resource type */

        union
        {
            struct
            {
                CUarray hArray; /**< CUDA array */
            } array;
            struct
            {
                CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
            } mipmap;
            struct
            {
                CUdeviceptr    devPtr;      /**< Device pointer */
                CUarray_format format;      /**< Array format */
                unsigned int   numChannels; /**< Channels per array element */
                size_t         sizeInBytes; /**< Size in bytes */
            } linear;
            struct
            {
                CUdeviceptr    devPtr;       /**< Device pointer */
                CUarray_format format;       /**< Array format */
                unsigned int   numChannels;  /**< Channels per array element */
                size_t         width;        /**< Width of the array in elements */
                size_t         height;       /**< Height of the array in elements */
                size_t         pitchInBytes; /**< Pitch between two rows in bytes */
            } pitch2D;
            /* Padding member: keeps the union at least 32 ints wide regardless
             * of the size of the other members. */
            struct
            {
                int reserved[32];
            } __reserved;
        } res;

        unsigned int flags; /**< Flags (must be zero) */
    } CUDA_RESOURCE_DESC;

    /**
     * Texture descriptor
     *
     * Holds three address modes (one array element each), the filter modes,
     * a flags word, the anisotropy limit and the mipmap bias/clamp values,
     * followed by 16 reserved ints.
     */
    typedef struct CUDA_TEXTURE_DESC_st
    {
        CUaddress_mode addressMode[3];      /**< Address modes */
        CUfilter_mode  filterMode;          /**< Filter mode */
        unsigned int   flags;               /**< Flags */
        unsigned int   maxAnisotropy;       /**< Maximum anisotropy ratio */
        CUfilter_mode  mipmapFilterMode;    /**< Mipmap filter mode */
        float          mipmapLevelBias;     /**< Mipmap level bias */
        float          minMipmapLevelClamp; /**< Mipmap minimum level clamp */
        float          maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
        int            _reserved[16];       /**< Reserved; no meaning is assigned in this header */
    } CUDA_TEXTURE_DESC;

    /**
     * Resource view format
     *
     * Enumerators are numbered contiguously from 0 to 34. Each value is
     * written out explicitly so that the numbering does not depend on
     * declaration order.
     */
    typedef enum CUresourceViewFormat_enum {
        /* No view format: the underlying resource format is used. */
        CU_RES_VIEW_FORMAT_NONE = 0,

        /* 8-bit integer channels. */
        CU_RES_VIEW_FORMAT_UINT_1X8 = 1, /**< unsigned, 1 channel */
        CU_RES_VIEW_FORMAT_UINT_2X8 = 2, /**< unsigned, 2 channels */
        CU_RES_VIEW_FORMAT_UINT_4X8 = 3, /**< unsigned, 4 channels */
        CU_RES_VIEW_FORMAT_SINT_1X8 = 4, /**< signed, 1 channel */
        CU_RES_VIEW_FORMAT_SINT_2X8 = 5, /**< signed, 2 channels */
        CU_RES_VIEW_FORMAT_SINT_4X8 = 6, /**< signed, 4 channels */

        /* 16-bit integer channels. */
        CU_RES_VIEW_FORMAT_UINT_1X16 = 7,  /**< unsigned, 1 channel */
        CU_RES_VIEW_FORMAT_UINT_2X16 = 8,  /**< unsigned, 2 channels */
        CU_RES_VIEW_FORMAT_UINT_4X16 = 9,  /**< unsigned, 4 channels */
        CU_RES_VIEW_FORMAT_SINT_1X16 = 10, /**< signed, 1 channel */
        CU_RES_VIEW_FORMAT_SINT_2X16 = 11, /**< signed, 2 channels */
        CU_RES_VIEW_FORMAT_SINT_4X16 = 12, /**< signed, 4 channels */

        /* 32-bit integer channels. */
        CU_RES_VIEW_FORMAT_UINT_1X32 = 13, /**< unsigned, 1 channel */
        CU_RES_VIEW_FORMAT_UINT_2X32 = 14, /**< unsigned, 2 channels */
        CU_RES_VIEW_FORMAT_UINT_4X32 = 15, /**< unsigned, 4 channels */
        CU_RES_VIEW_FORMAT_SINT_1X32 = 16, /**< signed, 1 channel */
        CU_RES_VIEW_FORMAT_SINT_2X32 = 17, /**< signed, 2 channels */
        CU_RES_VIEW_FORMAT_SINT_4X32 = 18, /**< signed, 4 channels */

        /* Floating-point channels. */
        CU_RES_VIEW_FORMAT_FLOAT_1X16 = 19, /**< 16-bit float, 1 channel */
        CU_RES_VIEW_FORMAT_FLOAT_2X16 = 20, /**< 16-bit float, 2 channels */
        CU_RES_VIEW_FORMAT_FLOAT_4X16 = 21, /**< 16-bit float, 4 channels */
        CU_RES_VIEW_FORMAT_FLOAT_1X32 = 22, /**< 32-bit float, 1 channel */
        CU_RES_VIEW_FORMAT_FLOAT_2X32 = 23, /**< 32-bit float, 2 channels */
        CU_RES_VIEW_FORMAT_FLOAT_4X32 = 24, /**< 32-bit float, 4 channels */

        /* Block-compressed formats. */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 25, /**< Block compressed 1 */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 26, /**< Block compressed 2 */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 27, /**< Block compressed 3 */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 28, /**< Block compressed 4, unsigned */
        CU_RES_VIEW_FORMAT_SIGNED_BC4    = 29, /**< Block compressed 4, signed */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 30, /**< Block compressed 5, unsigned */
        CU_RES_VIEW_FORMAT_SIGNED_BC5    = 31, /**< Block compressed 5, signed */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 32, /**< Block compressed 6, unsigned half-float */
        CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 33, /**< Block compressed 6, signed half-float */
        CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 34  /**< Block compressed 7 */
    } CUresourceViewFormat;

    /**
     * Resource view descriptor
     *
     * Combines a ::CUresourceViewFormat with the view extents and the
     * inclusive-looking first/last mipmap level and layer indices, followed by
     * 16 reserved words.
     *
     * NOTE(review): whether the last* indices are inclusive is not stated in
     * this header - confirm against the driver API documentation.
     */
    typedef struct CUDA_RESOURCE_VIEW_DESC_st
    {
        CUresourceViewFormat format;           /**< Resource view format */
        size_t               width;            /**< Width of the resource view */
        size_t               height;           /**< Height of the resource view */
        size_t               depth;            /**< Depth of the resource view */
        unsigned int         firstMipmapLevel; /**< First defined mipmap level */
        unsigned int         lastMipmapLevel;  /**< Last defined mipmap level */
        unsigned int         firstLayer;       /**< First layer index */
        unsigned int         lastLayer;        /**< Last layer index */
        unsigned int         _reserved[16];    /**< Reserved; no meaning is assigned in this header */
    } CUDA_RESOURCE_VIEW_DESC;

    /**
     * GPU Direct v3 tokens
     *
     * Pair of token values; their meaning is not described in this header.
     */
    typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
    {
        unsigned long long p2pToken;     /**< P2P token value */
        unsigned int       vaSpaceToken; /**< VA-space token value */
    } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
#endif /* __CUDA_API_VERSION >= 5000 */


/**
 * If set, the CUDA array is a collection of layers, where each layer is either a 1D
 * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
 * of layers, not the depth of a 3D array.
 */
#define CUDA_ARRAY3D_LAYERED 0x01

/**
 * Deprecated, use CUDA_ARRAY3D_LAYERED
 * (same numeric value, 0x01).
 */
#define CUDA_ARRAY3D_2DARRAY 0x01

/**
 * This flag must be set in order to bind a surface reference
 * to the CUDA array
 */
#define CUDA_ARRAY3D_SURFACE_LDST 0x02

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_READ_AS_INTEGER 0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_NORMALIZED_COORDINATES 0x02

/**
 * Perform sRGB->linear conversion during texture read.
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_SRGB 0x10

/*
 * The three CU_LAUNCH_PARAM_* macros below are sentinel values of type
 * void * (0, 1 and 2 cast to a pointer); they are markers to be compared
 * against, not addresses to dereference.
 */

/**
 * End of array terminator for the \p extra parameter to
 * ::cuLaunchKernel
 */
#define CU_LAUNCH_PARAM_END ((void *)0x00)

/**
 * Indicator that the next value in the \p extra parameter to
 * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
 * parameters used for launching kernel \p f.  This buffer needs to
 * honor all alignment/padding requirements of the individual parameters.
 * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
 * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
 * effect.
 */
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *)0x01)

/**
 * Indicator that the next value in the \p extra parameter to
 * ::cuLaunchKernel will be a pointer to a size_t which contains the
 * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
 * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
 * in the \p extra array if the value associated with
 * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
 */
#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *)0x02)

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 *
 * Note: the replacement text is the unparenthesized token sequence "-1", and
 * this macro is defined a second time further down in this header with the
 * same text; the two definitions must stay textually identical.
 */
#define CU_PARAM_TR_DEFAULT -1

/**
 * CUDA API made obsolete at API version 3020
 *
 * When __CUDA_API_VERSION_INTERNAL is defined, the macros below rename the
 * device-pointer type and the memcpy / array descriptor types to *_v1 names,
 * so that the legacy declarations that follow are emitted under those names
 * rather than under the names already used by the current declarations.
 */
#if defined(__CUDA_API_VERSION_INTERNAL)
#define CUdeviceptr                CUdeviceptr_v1
#define CUDA_MEMCPY2D_st           CUDA_MEMCPY2D_v1_st
#define CUDA_MEMCPY2D              CUDA_MEMCPY2D_v1
#define CUDA_MEMCPY3D_st           CUDA_MEMCPY3D_v1_st
#define CUDA_MEMCPY3D              CUDA_MEMCPY3D_v1
#define CUDA_ARRAY_DESCRIPTOR_st   CUDA_ARRAY_DESCRIPTOR_v1_st
#define CUDA_ARRAY_DESCRIPTOR      CUDA_ARRAY_DESCRIPTOR_v1
#define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
#define CUDA_ARRAY3D_DESCRIPTOR    CUDA_ARRAY3D_DESCRIPTOR_v1
#endif /* __CUDA_API_VERSION_INTERNAL */

/*
 * Legacy type declarations, compiled when __CUDA_API_VERSION_INTERNAL is
 * defined or when __CUDA_API_VERSION < 3020. They have the same member
 * sequence as the size_t-based declarations earlier in this header, but use
 * unsigned int for the device pointer and for every offset, pitch and extent.
 */
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020

    /** Legacy device pointer: an unsigned int on every platform */
    typedef unsigned int CUdeviceptr;

    /** Legacy 2D memory copy parameters (unsigned int offsets, pitches and extents) */
    typedef struct CUDA_MEMCPY2D_st
    {
        unsigned int srcXInBytes;   /**< Source X in bytes */
        unsigned int srcY;          /**< Source Y */
        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */

        unsigned int dstXInBytes;   /**< Destination X in bytes */
        unsigned int dstY;          /**< Destination Y */
        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */

        unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
        unsigned int Height;       /**< Height of 2D memory copy */
    } CUDA_MEMCPY2D;

    /** Legacy 3D memory copy parameters (unsigned int offsets, pitches and extents) */
    typedef struct CUDA_MEMCPY3D_st
    {
        unsigned int srcXInBytes;   /**< Source X in bytes */
        unsigned int srcY;          /**< Source Y */
        unsigned int srcZ;          /**< Source Z */
        unsigned int srcLOD;        /**< Source LOD */
        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        void        *reserved0;     /**< Must be NULL */
        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
        unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */

        unsigned int dstXInBytes;   /**< Destination X in bytes */
        unsigned int dstY;          /**< Destination Y */
        unsigned int dstZ;          /**< Destination Z */
        unsigned int dstLOD;        /**< Destination LOD */
        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        void        *reserved1;     /**< Must be NULL */
        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
        unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

        unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
        unsigned int Height;       /**< Height of 3D memory copy */
        unsigned int Depth;        /**< Depth of 3D memory copy */
    } CUDA_MEMCPY3D;

    /** Legacy array descriptor (unsigned int extents) */
    typedef struct CUDA_ARRAY_DESCRIPTOR_st
    {
        unsigned int Width;  /**< Width of array */
        unsigned int Height; /**< Height of array */

        CUarray_format Format;      /**< Array format */
        unsigned int   NumChannels; /**< Channels per array element */
    } CUDA_ARRAY_DESCRIPTOR;

    /** Legacy 3D array descriptor (unsigned int extents) */
    typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
    {
        unsigned int Width;  /**< Width of 3D array */
        unsigned int Height; /**< Height of 3D array */
        unsigned int Depth;  /**< Depth of 3D array */

        CUarray_format Format;      /**< Array format */
        unsigned int   NumChannels; /**< Channels per array element */
        unsigned int   Flags;       /**< Flags */
    } CUDA_ARRAY3D_DESCRIPTOR;

#endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */

/*
 * NOTE: every macro from here through CU_PARAM_TR_DEFAULT is already defined
 * earlier in this header with exactly the same replacement text, so these
 * are benign redefinitions. Any change to a value must be made in both
 * places, otherwise the preprocessor will report an incompatible
 * redefinition.
 */

/*
 * If set, the CUDA array contains an array of 2D slices
 * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
 * the number of slices, not the depth of a 3D array.
 */
#define CUDA_ARRAY3D_2DARRAY 0x01

/**
 * This flag must be set in order to bind a surface reference
 * to the CUDA array
 */
#define CUDA_ARRAY3D_SURFACE_LDST 0x02

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_READ_AS_INTEGER 0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_NORMALIZED_COORDINATES 0x02

/**
 * Perform sRGB->linear conversion during texture read.
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_SRGB 0x10

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 */
#define CU_PARAM_TR_DEFAULT -1

    /** @} */ /* END CUDA_TYPES */

/*
 * CUDAAPI: calling-convention qualifier placed on every t-prefixed function
 * type declared below. It expands to __stdcall when any of the Windows
 * platform macros tested here is defined and to nothing otherwise.
 */
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif

    /**
     * \defgroup CUDA_INITIALIZE Initialization
     *
     * This section describes the initialization functions of the low-level CUDA
     * driver application programming interface.
     *
     * @{
     */

    /*********************************
     ** Initialization
     *********************************/
    /* tcuInit is a function type (not a function-pointer type): one unsigned
     * flags argument, CUresult return, CUDAAPI calling convention.
     * NOTE(review): the t-prefixed typedefs presumably mirror the driver entry
     * points of the same name without the prefix - confirm against the code
     * that resolves the symbols. */
    typedef CUresult CUDAAPI tcuInit(unsigned int Flags);

    /*********************************
     ** Driver Version Query
     *********************************/
    /* Function type taking an int out-parameter and returning a CUresult. */
    typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);

    /************************************
     **
     **    Device management
     **
     ***********************************/

    /* Function types for the device-query entry points. Every one returns a
     * CUresult; results are written through the leading pointer argument(s). */
    typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
    typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
    typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
    typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
    /* The byte count is a size_t from API version 3020 on and an unsigned int
     * before that. */
#if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
#else
typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
#endif /* __CUDA_API_VERSION >= 3020 */

    typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
    typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
    /* Takes a CUresult value and writes a const char * through pStr. */
    typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char **pStr);

    /************************************
     **
     **    Context management
     **
     ***********************************/

    /* Function types for context creation/destruction, attach/detach and the
     * push/pop pair. Types with a CUcontext * parameter write a context handle
     * through it; the others take the handle by value. */
    typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
    typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
    typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);

    /* Set/get pair for the current context. */
    typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);

    typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
    /* Takes no arguments. */
    typedef CUresult CUDAAPI tcuCtxSynchronize(void);


    /************************************
     **
     **    Module management
     **
     ***********************************/

    /* Function types for loading a module from a file name, from an in-memory
     * image, from an image plus an option list, or from a fat binary. Each
     * writes the resulting handle through the CUmodule * argument. */
    typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
    typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
    /* options and optionValues are parallel arrays; numOptions is passed
     * alongside them (presumably their common length - confirm). */
    typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule     *module,
                                                 const void   *image,
                                                 unsigned int  numOptions,
                                                 CUjit_option *options,
                                                 void        **optionValues);
    typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
    typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
    /* Lookup by name within a module; the handle is written through hfunc. */
    typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);

    /* The size out-parameter is a size_t from API version 3020 on and an
     * unsigned int before that. */
#if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
#else
typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
#endif /* __CUDA_API_VERSION >= 3020 */

    typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
    typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);

/************************************
 **
 **    Memory management
 **
 ***********************************/
/* Sizes and pitches are size_t from API version 3020 on and unsigned int
 * before that; the two branches below are otherwise parallel. */
#if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                              size_t      *pPitch,
                                              size_t       WidthInBytes,
                                              size_t       Height,
                                              // size of biggest r/w to be performed by kernels on this memory
                                              // 4, 8 or 16 bytes
                                              unsigned int ElementSizeBytes);
#else
typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr  *dptr,
                                          unsigned int *pPitch,
                                          unsigned int  WidthInBytes,
                                          unsigned int  Height,
                                          // size of biggest r/w to be performed by kernels on this memory
                                          // 4, 8 or 16 bytes
                                          unsigned int ElementSizeBytes);
#endif /* __CUDA_API_VERSION >= 3020 */

    typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);

    /* NOTE(review): tcuMemHostGetDevicePointer is declared only in the
     * >= 3020 branch; the #else branch has no counterpart, so the type does
     * not exist for older API versions. Confirm this is intentional. */
#if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
    typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
#else
typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
#endif /* __CUDA_API_VERSION >= 3020 */

    typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
    /* Declared unconditionally with a size_t byte count; the Flags values are
     * the CU_MEMHOSTALLOC_* macros documented as "Flag for ::cuMemHostAlloc()". */
    typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);

    typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);

#if __CUDA_API_VERSION >= 4010
/**
 * Interprocess Handles
 *
 * Size in bytes of the opaque payload carried by both handle structs below.
 */
#define CU_IPC_HANDLE_SIZE 64

    /** Opaque event handle: CU_IPC_HANDLE_SIZE bytes with no declared structure */
    typedef struct CUipcEventHandle_st
    {
        char reserved[CU_IPC_HANDLE_SIZE];
    } CUipcEventHandle;

    /** Opaque memory handle: CU_IPC_HANDLE_SIZE bytes with no declared structure */
    typedef struct CUipcMemHandle_st
    {
        char reserved[CU_IPC_HANDLE_SIZE];
    } CUipcMemHandle;

    /** Flag values for the Flags argument of tcuIpcOpenMemHandle (presumably - confirm) */
    typedef enum CUipcMem_flags_enum {
        CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS =
            0x1 /**< Automatically enable peer access between remote devices as needed */
    } CUipcMem_flags;

    /* Function types for the PCI-bus-id lookups and the IPC handle entry
     * points. Note that the Open* types take the handle struct by value. */
    typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
    typedef CUresult CUDAAPI tcuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
    typedef CUresult CUDAAPI tcuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
    typedef CUresult CUDAAPI tcuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
    typedef CUresult CUDAAPI tcuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
    typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
    typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
#endif /* __CUDA_API_VERSION >= 4010 */

    /* Function types for registering / unregistering a host memory range.
     * The Flags values are the CU_MEMHOSTREGISTER_* macros documented as
     * "Flag for ::cuMemHostRegister()". */
    typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
    typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);

    /* Function type for a copy between two device pointers; ByteCount is a
     * size_t regardless of API version. */
    typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
    /* Function type for a copy between device pointers that each carry their
     * own context handle. */
    typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice,
                                           CUcontext   dstContext,
                                           CUdeviceptr srcDevice,
                                           CUcontext   srcContext,
                                           size_t      ByteCount);

/************************************
 **
 **    Synchronous Memcpy
 **
 ** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
 ** but if host memory is involved, they wait until the copy is done before returning.
 **
 ***********************************/

// 1D functions
#if __CUDA_API_VERSION >= 3020
    // API version 3020 and later: every offset and byte count is a size_t.

    // host memory <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice,
                                           const void *srcHost,
                                           size_t      ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyDtoH(void       *dstHost,
                                           CUdeviceptr srcDevice,
                                           size_t      ByteCount);

    // device memory <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice,
                                           CUdeviceptr srcDevice,
                                           size_t      ByteCount);

    // device memory <-> CUDA array
    typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray     dstArray,
                                           size_t      dstOffset,
                                           CUdeviceptr srcDevice,
                                           size_t      ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice,
                                           CUarray     srcArray,
                                           size_t      srcOffset,
                                           size_t      ByteCount);

    // host memory <-> CUDA array
    typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray     dstArray,
                                           size_t      dstOffset,
                                           const void *srcHost,
                                           size_t      ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyAtoH(void   *dstHost,
                                           CUarray srcArray,
                                           size_t  srcOffset,
                                           size_t  ByteCount);

    // CUDA array <-> CUDA array
    typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray,
                                           size_t  dstOffset,
                                           CUarray srcArray,
                                           size_t  srcOffset,
                                           size_t  ByteCount);
#else
    // Older API versions: every offset and byte count is an unsigned int.

    // host memory <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);

    // device memory <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

    // device memory <-> CUDA array
    typedef CUresult CUDAAPI
    tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
    typedef CUresult CUDAAPI
    tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

    // host memory <-> CUDA array
    typedef CUresult CUDAAPI
    tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
    typedef CUresult CUDAAPI
    tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

    // CUDA array <-> CUDA array
    typedef CUresult CUDAAPI tcuMemcpyAtoA(
        CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
#endif

    // 2D memcpy: the whole copy is described by a CUDA_MEMCPY2D structure passed by pointer

    typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
    typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);

    // 3D memcpy: the whole copy is described by a CUDA_MEMCPY3D structure passed by pointer

    typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);

/************************************
 **
 **    Asynchronous Memcpy
 **
 ** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
 ** memcpy's done with these functions execute in parallel with the CPU and, if
 ** the hardware is available, may execute in parallel with the GPU.
 ** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
 **
 ***********************************/

// 1D functions
#if __CUDA_API_VERSION >= 3020
    // API version 3020 and later: offsets and byte counts are size_t.
    // Every signature takes the CUstream the copy is issued on as its last argument.

    // host memory <-> device memory
    typedef CUresult CUDAAPI
    tcuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI
    tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);

    // device memory <-> device memory
    typedef CUresult CUDAAPI
    tcuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);

    // host memory <-> CUDA array
    typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray     dstArray,
                                                size_t      dstOffset,
                                                const void *srcHost,
                                                size_t      ByteCount,
                                                CUstream    hStream);
    typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void    *dstHost,
                                                CUarray  srcArray,
                                                size_t   srcOffset,
                                                size_t   ByteCount,
                                                CUstream hStream);

#else
    // Older API versions: offsets and byte counts are unsigned int.

    // host memory <-> device memory
    typedef CUresult CUDAAPI
    tcuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI
    tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);

    // device memory <-> device memory
    typedef CUresult CUDAAPI
    tcuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);

    // host memory <-> CUDA array
    typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(
        CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(
        void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
#endif

    // 2D memcpy: same CUDA_MEMCPY2D description as tcuMemcpy2D, plus the stream to issue the copy on
    typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);

    // 3D memcpy: same CUDA_MEMCPY3D description as tcuMemcpy3D, plus the stream to issue the copy on
    typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);

    /************************************
     **
     **    Memset
     **
     ***********************************/
    // Memset signatures (1D: D8/D16/D32 fill N elements; 2D: D2D8/D2D16/D2D32 fill a
    // Width x Height region with the given pitch).
    //
    // For API version 3020 and later the element count, the pitch and the 2D extents
    // are all size_t, as in the CUDA 3.2+ driver prototypes. Declaring N or dstPitch
    // with a narrower type here would make calls through these function types disagree
    // with the callee's signature on 64-bit targets. Older API versions use unsigned
    // int for all of these quantities.
#if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
    typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
    typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);

    typedef CUresult CUDAAPI
    tcuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
    typedef CUresult CUDAAPI
    tcuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
    typedef CUresult CUDAAPI
    tcuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
#else
    typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
    typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
    typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);

    typedef CUresult CUDAAPI tcuMemsetD2D8(
        CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
    typedef CUresult CUDAAPI tcuMemsetD2D16(
        CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
    typedef CUresult CUDAAPI tcuMemsetD2D32(
        CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
#endif

    /************************************
     **
     **    Function management
     **
     ***********************************/


    // Signatures of the per-function configuration and query entry points; each one
    // identifies the function it applies to through a CUfunction handle (hfunc).
    typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
    typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
    // Writes the requested attribute value through pi.
    typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
    typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
    typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);

    // Signature of the kernel launch entry point: the function handle, the three grid
    // dimensions, the three block dimensions, the shared memory size in bytes, the
    // stream, and two pointer arrays (kernelParams and extra).
    typedef CUresult CUDAAPI tcuLaunchKernel(
        CUfunction f,
        unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
        unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
        unsigned int sharedMemBytes,
        CUstream hStream,
        void **kernelParams,
        void **extra);

    /************************************
     **
     **    Array management
     **
     ***********************************/

    // Arrays described by a CUDA_ARRAY_DESCRIPTOR: create (handle returned through
    // pHandle), read the descriptor back, destroy.
    typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
    typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
    typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);

    // Arrays described by a CUDA_ARRAY3D_DESCRIPTOR. No separate 3D destroy signature
    // is declared in this section.
    typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
    typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);

#if __CUDA_API_VERSION >= 5000
    // Mipmapped-array signatures; only declared for API version 5000 and later.
    // The create call reuses CUDA_ARRAY3D_DESCRIPTOR and takes the number of mipmap levels.
    typedef CUresult CUDAAPI tcuMipmappedArrayCreate(
        CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
    // Returns, through pLevelArray, the CUarray for one level of a mipmapped array.
    typedef CUresult CUDAAPI
    tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
    typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
#endif


    /************************************
     **
     **    Texture reference management
     **
     ***********************************/
    // Texture reference lifetime: create (handle returned through pTexRef) and destroy.
    typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
    typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);

    // Associates a CUarray with a texture reference.
    typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);

#if __CUDA_API_VERSION >= 3020
    // API version 3020 and later: byte offset, size and pitch are size_t.
    typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t     *ByteOffset,
                                                 CUtexref    hTexRef,
                                                 CUdeviceptr dptr,
                                                 size_t      bytes);
    typedef CUresult CUDAAPI
    tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
#else
    // Older API versions: byte offset, size and pitch are unsigned int.
    typedef CUresult CUDAAPI
    tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
    typedef CUresult CUDAAPI
    tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
#endif

    // Setters: each takes the texture reference handle first, then the value(s) to apply.
    typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
    typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
    typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
    typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);

    // Getters: each takes the output pointer(s) first, then the texture reference handle.
    typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
    typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);

    /************************************
     **
     **    Surface reference management
     **
     ***********************************/

    // Set / get the CUarray associated with a surface reference; the getter returns
    // the array handle through phArray.
    typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
    typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);

    /************************************
     **
     **    Parameter management
     **
     ***********************************/

    // Signatures of the cuParam* entry points. Apart from tcuParamSetSize (total size in
    // bytes) and tcuParamSetTexRef (texture unit), each takes an int offset plus the
    // value to store: an unsigned int, a float, or a buffer of numbytes bytes.
    typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
    typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
    typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
    typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
    typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);


    /************************************
     **
     **    Launch functions
     **
     ***********************************/

    // Launch signatures that take only the function handle and, optionally, a 2D grid
    // size and a stream (compare tcuLaunchKernel, which carries the full configuration).
    typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
    typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
    typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);

    /************************************
     **
     **    Events
     **
     ***********************************/
    // Event signatures: create (handle returned through phEvent), record on a stream,
    // query, synchronize, destroy.
    typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
    typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
    typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
    typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
    typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
    // Writes the time between hStart and hEnd through pMilliseconds.
    typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);

    /************************************
     **
     **    Streams
     **
     ***********************************/
    // Stream creation; the new handle is returned through phStream.
    typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);

    // Operations that attach work or dependencies to a stream: waiting on an event,
    // and registering a CUstreamCallback together with a user data pointer.
    typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
    typedef CUresult CUDAAPI
    tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);

    // Status query, blocking synchronization and destruction.
    typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
    typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
    typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);

    /************************************
     **
     **    Graphics interop
     **
     ***********************************/
    // Signatures operating on a single CUgraphicsResource.
    typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
    typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(
        CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);

    // The mapped size is reported through pSize, whose pointee type depends on the API version.
#if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI
    tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
#else
    typedef CUresult CUDAAPI
    tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
#endif

    typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);

    // Map / unmap take an array of `count` resources and the stream to operate on.
    typedef CUresult CUDAAPI
    tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
    typedef CUresult CUDAAPI
    tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);

    /************************************
     **
     **    Export tables
     **
     ***********************************/
    // Looks up an export table by CUuuid; the table pointer is returned through ppExportTable.
    typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);

    /************************************
     **
     **    Limits
     **
     ***********************************/

    // Context limit set/get; limit values are size_t.
    typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
    typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
    // Context cache / shared memory configuration and API version query.
    // NOTE(review): no matching function pointer for these five types appears in the
    // extern list in this part of the header — confirm whether that is intentional.
    typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig);
    typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
    typedef CUresult CUDAAPI tcuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
    typedef CUresult CUDAAPI tcuCtxSetSharedMemConfig(CUsharedconfig config);
    typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version);

    /************************************
     **
     **    Profiler
     **
     ***********************************/
    // Takes no arguments; only a stop signature is declared in this section.
    typedef CUresult CUDAAPI tcuProfilerStop(void);

    /************************************
     ************************************/

    // cuInit is declared as an ordinary function, unlike the entry points below, which
    // are declared as function pointers.
    // NOTE(review): the unnamed first argument is presumably the driver init flags and
    // cudaVersion the API version to bind against — confirm against the definition.
    extern CUresult CUDAAPI cuInit(unsigned int, int cudaVersion);

    // Driver entry points exposed as global function pointers, each typed with the
    // matching t<name> function type so that calls look like direct API calls.
    // NOTE(review): presumably these are filled in by cuInit() — confirm in the
    // source file that defines them.

    // Device queries, error strings and context stack management
    extern tcuDriverGetVersion        *cuDriverGetVersion;
    extern tcuDeviceGet               *cuDeviceGet;
    extern tcuDeviceGetCount          *cuDeviceGetCount;
    extern tcuDeviceGetName           *cuDeviceGetName;
    extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
    extern tcuDeviceGetProperties     *cuDeviceGetProperties;
    extern tcuDeviceGetAttribute      *cuDeviceGetAttribute;
    extern tcuGetErrorString          *cuGetErrorString;
    extern tcuCtxDestroy              *cuCtxDestroy;
    extern tcuCtxAttach               *cuCtxAttach;
    extern tcuCtxDetach               *cuCtxDetach;
    extern tcuCtxPushCurrent          *cuCtxPushCurrent;
    extern tcuCtxPopCurrent           *cuCtxPopCurrent;

    extern tcuCtxSetCurrent *cuCtxSetCurrent;
    extern tcuCtxGetCurrent *cuCtxGetCurrent;

    // Context device/synchronize, module loading and host memory
    extern tcuCtxGetDevice        *cuCtxGetDevice;
    extern tcuCtxSynchronize      *cuCtxSynchronize;
    extern tcuModuleLoad          *cuModuleLoad;
    extern tcuModuleLoadData      *cuModuleLoadData;
    extern tcuModuleLoadDataEx    *cuModuleLoadDataEx;
    extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
    extern tcuModuleUnload        *cuModuleUnload;
    extern tcuModuleGetFunction   *cuModuleGetFunction;
    extern tcuModuleGetTexRef     *cuModuleGetTexRef;
    extern tcuModuleGetSurfRef    *cuModuleGetSurfRef;
    extern tcuMemFreeHost         *cuMemFreeHost;
    extern tcuMemHostAlloc        *cuMemHostAlloc;
    extern tcuMemHostGetFlags     *cuMemHostGetFlags;

    extern tcuMemHostRegister   *cuMemHostRegister;
    extern tcuMemHostUnregister *cuMemHostUnregister;
    extern tcuMemcpy            *cuMemcpy;
    extern tcuMemcpyPeer        *cuMemcpyPeer;

    // Memory allocation, function configuration, texture/surface references,
    // parameters, launches, events, streams, graphics interop and limits
    extern tcuDeviceTotalMem                    *cuDeviceTotalMem;
    extern tcuCtxCreate                         *cuCtxCreate;
    extern tcuModuleGetGlobal                   *cuModuleGetGlobal;
    extern tcuMemGetInfo                        *cuMemGetInfo;
    extern tcuMemAlloc                          *cuMemAlloc;
    extern tcuMemAllocPitch                     *cuMemAllocPitch;
    extern tcuMemFree                           *cuMemFree;
    extern tcuMemGetAddressRange                *cuMemGetAddressRange;
    extern tcuMemAllocHost                      *cuMemAllocHost;
    extern tcuMemHostGetDevicePointer           *cuMemHostGetDevicePointer;
    extern tcuFuncSetBlockShape                 *cuFuncSetBlockShape;
    extern tcuFuncSetSharedSize                 *cuFuncSetSharedSize;
    extern tcuFuncGetAttribute                  *cuFuncGetAttribute;
    extern tcuFuncSetCacheConfig                *cuFuncSetCacheConfig;
    extern tcuFuncSetSharedMemConfig            *cuFuncSetSharedMemConfig;
    extern tcuLaunchKernel                      *cuLaunchKernel;
    extern tcuArrayDestroy                      *cuArrayDestroy;
    extern tcuTexRefCreate                      *cuTexRefCreate;
    extern tcuTexRefDestroy                     *cuTexRefDestroy;
    extern tcuTexRefSetArray                    *cuTexRefSetArray;
    extern tcuTexRefSetFormat                   *cuTexRefSetFormat;
    extern tcuTexRefSetAddressMode              *cuTexRefSetAddressMode;
    extern tcuTexRefSetFilterMode               *cuTexRefSetFilterMode;
    extern tcuTexRefSetFlags                    *cuTexRefSetFlags;
    extern tcuTexRefGetArray                    *cuTexRefGetArray;
    extern tcuTexRefGetAddressMode              *cuTexRefGetAddressMode;
    extern tcuTexRefGetFilterMode               *cuTexRefGetFilterMode;
    extern tcuTexRefGetFormat                   *cuTexRefGetFormat;
    extern tcuTexRefGetFlags                    *cuTexRefGetFlags;
    extern tcuSurfRefSetArray                   *cuSurfRefSetArray;
    extern tcuSurfRefGetArray                   *cuSurfRefGetArray;
    extern tcuParamSetSize                      *cuParamSetSize;
    extern tcuParamSeti                         *cuParamSeti;
    extern tcuParamSetf                         *cuParamSetf;
    extern tcuParamSetv                         *cuParamSetv;
    extern tcuParamSetTexRef                    *cuParamSetTexRef;
    extern tcuLaunch                            *cuLaunch;
    extern tcuLaunchGrid                        *cuLaunchGrid;
    extern tcuLaunchGridAsync                   *cuLaunchGridAsync;
    extern tcuEventCreate                       *cuEventCreate;
    extern tcuEventRecord                       *cuEventRecord;
    extern tcuEventQuery                        *cuEventQuery;
    extern tcuEventSynchronize                  *cuEventSynchronize;
    extern tcuEventDestroy                      *cuEventDestroy;
    extern tcuEventElapsedTime                  *cuEventElapsedTime;
    extern tcuStreamCreate                      *cuStreamCreate;
    extern tcuStreamQuery                       *cuStreamQuery;
    extern tcuStreamWaitEvent                   *cuStreamWaitEvent;
    extern tcuStreamAddCallback                 *cuStreamAddCallback;
    extern tcuStreamSynchronize                 *cuStreamSynchronize;
    extern tcuStreamDestroy                     *cuStreamDestroy;
    extern tcuGraphicsUnregisterResource        *cuGraphicsUnregisterResource;
    extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
    extern tcuGraphicsResourceSetMapFlags       *cuGraphicsResourceSetMapFlags;
    extern tcuGraphicsMapResources              *cuGraphicsMapResources;
    extern tcuGraphicsUnmapResources            *cuGraphicsUnmapResources;
    extern tcuGetExportTable                    *cuGetExportTable;
    extern tcuCtxSetLimit                       *cuCtxSetLimit;
    extern tcuCtxGetLimit                       *cuCtxGetLimit;

    // These functions could be using the CUDA 3.2 interface (_v2).
    // Most of the corresponding t<name> types above switch between size_t and
    // unsigned int parameters depending on __CUDA_API_VERSION.
    extern tcuMemcpyHtoD                       *cuMemcpyHtoD;
    extern tcuMemcpyDtoH                       *cuMemcpyDtoH;
    extern tcuMemcpyDtoD                       *cuMemcpyDtoD;
    extern tcuMemcpyDtoA                       *cuMemcpyDtoA;
    extern tcuMemcpyAtoD                       *cuMemcpyAtoD;
    extern tcuMemcpyHtoA                       *cuMemcpyHtoA;
    extern tcuMemcpyAtoH                       *cuMemcpyAtoH;
    extern tcuMemcpyAtoA                       *cuMemcpyAtoA;
    extern tcuMemcpy2D                         *cuMemcpy2D;
    extern tcuMemcpy2DUnaligned                *cuMemcpy2DUnaligned;
    extern tcuMemcpy3D                         *cuMemcpy3D;
    extern tcuMemcpyHtoDAsync                  *cuMemcpyHtoDAsync;
    extern tcuMemcpyDtoHAsync                  *cuMemcpyDtoHAsync;
    extern tcuMemcpyDtoDAsync                  *cuMemcpyDtoDAsync;
    extern tcuMemcpyHtoAAsync                  *cuMemcpyHtoAAsync;
    extern tcuMemcpyAtoHAsync                  *cuMemcpyAtoHAsync;
    extern tcuMemcpy2DAsync                    *cuMemcpy2DAsync;
    extern tcuMemcpy3DAsync                    *cuMemcpy3DAsync;
    extern tcuMemsetD8                         *cuMemsetD8;
    extern tcuMemsetD16                        *cuMemsetD16;
    extern tcuMemsetD32                        *cuMemsetD32;
    extern tcuMemsetD2D8                       *cuMemsetD2D8;
    extern tcuMemsetD2D16                      *cuMemsetD2D16;
    extern tcuMemsetD2D32                      *cuMemsetD2D32;
    extern tcuArrayCreate                      *cuArrayCreate;
    extern tcuArrayGetDescriptor               *cuArrayGetDescriptor;
    extern tcuArray3DCreate                    *cuArray3DCreate;
    extern tcuArray3DGetDescriptor             *cuArray3DGetDescriptor;
    extern tcuTexRefSetAddress                 *cuTexRefSetAddress;
    extern tcuTexRefSetAddress2D               *cuTexRefSetAddress2D;
    extern tcuTexRefGetAddress                 *cuTexRefGetAddress;
    extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;

    // Function pointers for the mipmapped-array entry points. The tcuMipmappedArray*
    // types are only declared when __CUDA_API_VERSION >= 5000, so the pointers are
    // declared under the same condition; otherwise the header would not compile for
    // lower API versions.
#if __CUDA_API_VERSION >= 5000
    extern tcuMipmappedArrayCreate   *cuMipmappedArrayCreate;
    extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
    extern tcuMipmappedArrayDestroy  *cuMipmappedArrayDestroy;
#endif

    // Function pointer for the profiler stop entry point (type tcuProfilerStop, no arguments).
    extern tcuProfilerStop *cuProfilerStop;

#ifdef __cplusplus
}
#endif

// #undef __CUDA_API_VERSION

#endif //__cuda_drvapi_dynlink_cuda_h__