Update a few apps to use HIP

4 years ago · eafe3b9439
5 changed files with 77 additions and 41 deletions
--- a/chLib/chError.h
+++ b/chLib/chError.h
@@ -39,7 +39,6 @@
 *
 */

-
 #ifndef __CHERROR_H__
 #define __CHERROR_H__

@@ -47,15 +46,27 @@
 #include <stdio.h>
 #endif

-#include <chCUDA.h>
+#ifdef __HIPCC__
+
+#include <hip/hip_runtime.h>
+#define cuda( fn ) do { \
+	            status = (hip##fn); \
+	            if ( hipSuccess != (status) ) { \
+			                                    goto Error; \
+			                                } \
+	            } while (0);
+#else

 #ifndef NO_CUDA

+#include <chCUDA.h>
+
 template<typename T>
 inline const char *
 chGetErrorString( T status )
 {
-    return cudaGetErrorString(status);
+    return hipGetErrorString(status);
+    //return cudaGetErrorString(status);
 }

 template<>
@@ -163,7 +174,7 @@ chGetErrorString( CUresult status )
 #ifdef DEBUG
 #define CUDART_CHECK( fn ) do { \
        (status) =  (fn); \
-        if ( cudaSuccess != (status) ) { \
+        if ( hipSuccess != (status) ) { \
            fprintf( stderr, "CUDA Runtime Failure (line %d of file %s):\n\t" \
                "%s returned 0x%x (%s)\n", \
                __LINE__, __FILE__, #fn, status, chGetErrorString(status) ); \
@@ -172,9 +183,9 @@ chGetErrorString( CUresult status )
    } while (0);

 #define cuda( fn ) do { \
-        (status) =  (cuda##fn); \
-        if ( cudaSuccess != (status) ) { \
-            fprintf( stderr, "CUDA Runtime Failure (line %d of file %s):\n\t" \
+        (status) =  (hip##fn); \
+        if ( hipSuccess != (status) ) { \
+            fprintf( stderr, "HIP Runtime Failure (line %d of file %s):\n\t" \
                "%s returned 0x%x (%s)\n", \
                __LINE__, __FILE__, #fn, status, chGetErrorString(status) ); \
            goto Error; \
@@ -182,7 +193,7 @@ chGetErrorString( CUresult status )
    } while (0);

 #define cu( fn ) do { \
-        (status) =  (cu##fn); \
+        (status) =  (hip##fn); \
        if ( CUDA_SUCCESS != (status) ) { \
            fprintf( stderr, "CUDA Runtime Failure (line %d of file %s):\n\t%s "\
                "returned 0x%x (%s)\n", \
@@ -203,17 +214,16 @@ chGetErrorString( CUresult status )

 #else

-
 #define CUDART_CHECK( fn ) do { \
    status = (fn); \
-    if ( cudaSuccess != (status) ) { \
+    if ( hipSuccess != (status) ) { \
 	    goto Error; \
 	} \
    } while (0);

 #define cuda( fn ) do { \
-    status = (cuda##fn); \
-    if ( cudaSuccess != (status) ) { \
+    status = (hip##fn); \
+    if ( hipSuccess != (status) ) { \
 	    goto Error; \
 	} \
    } while (0);
@@ -244,14 +254,14 @@ chGetErrorString( T status )
    return "CUDA support is not built in.";
 }

-static inline const char* cudaGetErrorString( cudaError_t error )
-{
-	return "CUDA support is not built in.";
-}
+//static inline const char* cudaGetErrorString( hipError_t error )
+//{
+//	return "CUDA support is not built in.";
+//}

 #define CUDART_CHECK( fn ) do { \
    status = (fn); \
-    if ( cudaSuccess != (status) ) { \
+    if ( hipSuccess != (status) ) { \
            goto Error; \
        } \
    } while (0);
@@ -266,3 +276,6 @@ static inline const char* cudaGetErrorString( cudaError_t error )
 #endif

 #endif
+
+#endif // __HIPCC__
+
--- a/concurrency/breakevenHtoDMemcpy.cu
+++ b/concurrency/breakevenHtoDMemcpy.cu
@@ -38,13 +38,17 @@

 #include <stdio.h>

+#ifdef __HIPCC__
+
+#endif
+
 #include "chError.h"
 #include "chTimer.h"

 int
 main( int argc, char *argv[] )
 {
-    cudaError_t status;
+    hipError_t status;
    int *deviceInt = 0;
    int *hostInt = 0;
    const int cIterations = 100000;
@@ -58,7 +62,7 @@ main( int argc, char *argv[] )
    chTimerTimestamp start, stop;

    cuda(Malloc( &deviceInt, numBytes ) );
-    cuda(HostAlloc( &hostInt, numBytes, 0 ) );
+    cuda(HostMalloc( (void **) &hostInt, numBytes, 0 ) );

    for ( size_t byteCount = byteIncrement; 
          byteCount <= numBytes; 
@@ -68,7 +72,7 @@ main( int argc, char *argv[] )
        chTimerGetTime( &start );
        for ( int i = 0; i < cIterations; i++ ) {
            cuda(MemcpyAsync( deviceInt, hostInt, byteCount, 
-                cudaMemcpyHostToDevice, NULL ) );
+                hipMemcpyHostToDevice, NULL ) );
        }
        cuda(DeviceSynchronize() );
        chTimerGetTime( &stop );
@@ -81,8 +85,8 @@ main( int argc, char *argv[] )
    }


-    cudaFree( deviceInt );
-    cudaFreeHost( hostInt );
+    hipFree( deviceInt );
+    hipHostFree( hostInt );
    return 0;
 Error:
    printf( "Error performing allocation\n" );
--- a/concurrency/nullKernelAsync.cu
+++ b/concurrency/nullKernelAsync.cu
@@ -38,9 +38,18 @@

 #include <stdio.h>

-#include "chError.h"
+//#include "chError.h"
 #include "chTimer.h"

+#include <hip/hip_runtime.h>
+
+#define cuda( fn ) do { \
+	    status = (hip##fn); \
+	    if ( hipSuccess != (status) ) { \
+		                goto Error; \
+		            } \
+	    } while (0);
+
 __global__
 void
 NullKernel()
@@ -50,7 +59,7 @@ NullKernel()
 double
 usPerLaunch( int cIterations )
 {
-    cudaError_t status;
+    hipError_t status;
    double microseconds, ret;
    chTimerTimestamp start, stop;

--- a/concurrency/nullKernelSync.cu
+++ b/concurrency/nullKernelSync.cu
@@ -38,7 +38,17 @@

 #include <stdio.h>

-#include "chError.h"
+//#include "chError.h"
+
+#include <hip/hip_runtime.h>
+
+#define cuda( fn ) do { \
+	            status = (hip##fn); \
+	            if ( hipSuccess != (status) ) { \
+			                                    goto Error; \
+			                                } \
+	            } while (0);
+
 #include "chTimer.h"

 __global__
@@ -50,11 +60,11 @@ NullKernel()
 double
 usPerLaunch( int cIterations )
 {
-    cudaError_t status;
+    hipError_t status;
    double microseconds, ret;
    chTimerTimestamp start, stop;

-    cuda(Free(0) );
+    hipFree(0);//(Free(0) );

    chTimerGetTime( &start );
    for ( int i = 0; i < cIterations; i++ ) {
@@ -67,7 +77,7 @@ usPerLaunch( int cIterations )
    ret = microseconds / (float) cIterations;

 Error:
-    return (status) ? 0.0 : ret;
+    return ret;//return (status) ? 0.0 : ret;
 }

 int
--- a/microbench/globalRead.cu
+++ b/microbench/globalRead.cu
@@ -131,10 +131,10 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
    double elapsedTime;
    float ms;
    int cIterations;
-    cudaError_t status;
+    hipError_t status;
    T sumCPU;
-    cudaEvent_t evStart = 0;
-    cudaEvent_t evStop = 0;
+    hipEvent_t evStart = 0;
+    hipEvent_t evStop = 0;

    cuda(Malloc( &in, N*sizeof(T) ) );
    cuda(Malloc( &out, cBlocks*cThreads*sizeof(T) ) );
@@ -154,14 +154,14 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
        hostIn[i] = nextrand;
    }

-    cuda(Memcpy( in, hostIn, N*sizeof(T), cudaMemcpyHostToDevice ) );
+    cuda(Memcpy( in, hostIn, N*sizeof(T), hipMemcpyHostToDevice ) );
    cuda(EventCreate( &evStart ) );
    cuda(EventCreate( &evStop ) );

    {
        // confirm that kernel launch with this configuration writes correct result
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, true );
-        cuda(Memcpy( hostOut, out, cBlocks*cThreads*sizeof(T), cudaMemcpyDeviceToHost ) );
+        cuda(Memcpy( hostOut, out, cBlocks*cThreads*sizeof(T), hipMemcpyDeviceToHost ) );
        cuda(GetLastError() ); 
        T sumGPU = T(0);
        for ( size_t i = 0; i < cBlocks*cThreads; i++ ) {
@@ -174,11 +174,11 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
    }

    cIterations = 10;
-    cudaEventRecord( evStart );
+    cuda(EventRecord( evStart ) );
    for ( int i = 0; i < cIterations; i++ ) {
        GlobalReads<T,n><<<cBlocks,cThreads>>>( out, in+bOffset, N-bOffset, false );
    }
-    cudaEventRecord( evStop );
+    cuda(EventRecord( evStop ) );
    cuda(DeviceSynchronize() );
    // make configurations that cannot launch error-out with 0 bandwidth
    cuda(GetLastError() ); 
@@ -193,10 +193,10 @@ BandwidthReads( size_t N, int cBlocks, int cThreads )
 Error:
    if ( hostIn ) delete[] hostIn;
    if ( hostOut ) delete[] hostOut;
-    cudaEventDestroy( evStart );
-    cudaEventDestroy( evStop );
-    cudaFree( in );
-    cudaFree( out );
+    hipEventDestroy( evStart );
+    hipEventDestroy( evStop );
+    hipFree( in );
+    hipFree( out );
    return ret;
 }

@@ -252,10 +252,10 @@ Shmoo( size_t N, size_t threadStart, size_t threadStop, size_t cBlocks )
 int
 main( int argc, char *argv[] )
 {
-    cudaError_t status;
+    hipError_t status;
    int device = 0;
    int size = 16;
-    cudaDeviceProp prop;
+    hipDeviceProp_t prop;
    if ( chCommandLineGet( &device, "device", argc, argv ) ) {
        printf( "Using device %d...\n", device );
    }