HIPify histogram sample

3 years ago · 1cda62ac16
12 changed files with 38 additions and 25 deletions
--- a/chLib/chError_hip.h
+++ b/chLib/chError_hip.h
@ -70,6 +70,11 @@ template<typename T> hipError_t hipHostAlloc( T **pp, size_t N, unsigned int Fla
				@@ -70,6 +70,11 @@ template<typename T> hipError_t hipHostAlloc( T **pp, size_t N, unsigned int Fla
 template<typename T> hipError_t hipHostGetDevicePointer( T **pp, void *p, unsigned int Flags ) {
    return hipHostGetDevicePointer( (void **) pp, p, Flags );
 }
+
+template<typename T> hipError_t hipMallocPitch( T **pp, size_t *pPitch, size_t WidthInBytes, size_t Height )
+{
+    return hipMallocPitch( (void **) pp, pPitch, WidthInBytes, Height );
+}
 #endif

 // entry points
@ -79,17 +84,22 @@ template<typename T> hipError_t hipHostGetDevicePointer( T **pp, void *p, unsign
				@@ -79,17 +84,22 @@ template<typename T> hipError_t hipHostGetDevicePointer( T **pp, void *p, unsign
 #define cudaHostGetDevicePointer hipHostGetDevicePointer
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaEventDestroy hipEventDestroy
+#define cudaFreeArray hipFreeArray
 #define cudaGetErrorString hipGetErrorString

 // data types
 typedef hipStream_t cudaStream_t;
 typedef hipDeviceProp_t cudaDeviceProp;
+typedef hipArray cudaArray;
+

 // defines
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice

+#define cudaFuncCachePreferShared hipFuncCachePreferShared
+
 #define cudaHostAllocMapped 0
 #define cudaHostAllocPortable 0

--- a/chLib/pgm.cu
+++ b/chLib/pgm.cu
@ -37,6 +37,7 @@
				@@ -37,6 +37,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#include <chError.h>
 #include "pgm.h"

 int
@ -55,6 +56,7 @@ pgmLoad(
				@@ -55,6 +56,7 @@ pgmLoad(
    unsigned char *idata = NULL;
    unsigned char *ddata = NULL;
    size_t dPitch;
+    cudaError_t status;

    fp = fopen( filename, "rb" );
    if ( fp == NULL) {
@ -88,15 +90,14 @@ pgmLoad(
				@@ -88,15 +90,14 @@ pgmLoad(
        if ( (size_t) w != fread( idata+row*padWidth, 1, w, fp ) )
            goto Error;
    }
-    if ( cudaSuccess != cudaMallocPitch( (void **) &ddata, &dPitch, padWidth, padHeight ) )
-        goto Error;
+    cuda(MallocPitch( (void **) &ddata, &dPitch, padWidth, padHeight ) );
    *pWidth = padWidth;
    *pHeight = padHeight;
    *pHostPitch = padWidth;
    *pHostData = idata;
    *pDeviceData = ddata;
    *pDevicePitch = (unsigned int) dPitch;
-    cudaMemcpy2D( ddata, dPitch, idata, padWidth, padWidth, padHeight, cudaMemcpyHostToDevice );
+    cuda(Memcpy2D( ddata, dPitch, idata, padWidth, padWidth, padHeight, cudaMemcpyHostToDevice ));
    fclose(fp);
    return 0;
 Error:
--- a/histogram/histogram.cu
+++ b/histogram/histogram.cu
@ -70,7 +70,9 @@ texture<unsigned char, 2> texImage;
				@@ -70,7 +70,9 @@ texture<unsigned char, 2> texImage;
 #include "histogramPerThread4x64.cuh"
 #include "histogramPerThread4x32.cuh"

+#ifndef __HIPCC__
 #include "histogramNPP.cuh"
+#endif

 using namespace cudahandbook::threading;

@ -266,7 +268,7 @@ main(int argc, char *argv[])
				@@ -266,7 +268,7 @@ main(int argc, char *argv[])
    char *outputFilename = NULL;

    cudaArray *pArrayImage = NULL;
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+    //cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();

    {
        g_numCPUCores = processorCount();
@ -370,11 +372,12 @@ main(int argc, char *argv[])
				@@ -370,11 +372,12 @@ main(int argc, char *argv[])
        }
    }

-
+#ifndef __HIPCC__
    cuda(MallocArray( &pArrayImage, &desc, w, h ) );
    cuda(MemcpyToArray( pArrayImage, 0, 0, hidata, w*h, cudaMemcpyHostToDevice ) );
        
    cuda(BindTextureToArray( texImage, pArrayImage ) );
+#endif

    {
        cudaDeviceProp prop;
@ -451,8 +454,9 @@ main(int argc, char *argv[])
				@@ -451,8 +454,9 @@ main(int argc, char *argv[])
        TEST_VECTOR( GPUhistogramPerThread4x32, false, 1, NULL );
        TEST_VECTOR( GPUhistogramPerThread4x32_PeriodicMerge, false, 1, NULL );
    }
-
+#ifndef __HIPCC__
    TEST_VECTOR( GPUhistogramNPP, false, 1, NULL );
+#endif

    ret = 0;
 Error:
--- a/histogram/histogramPerBlock.cuh
+++ b/histogram/histogramPerBlock.cuh
@ -91,8 +91,8 @@ GPUhistogramPerBlock(
				@@ -91,8 +91,8 @@ GPUhistogramPerBlock(
    cudaError_t status;
    cudaEvent_t start = 0, stop = 0;

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(EventRecord( start, 0 ) );
    //histogramPerBlock<<<blocks,threads>>>( pHist, x, y, w, h );
--- a/histogram/histogramPerBlockOffset.cuh
+++ b/histogram/histogramPerBlockOffset.cuh
@ -99,8 +99,8 @@ GPUhistogramPerBlock(
				@@ -99,8 +99,8 @@ GPUhistogramPerBlock(
    cudaError_t status;
    cudaEvent_t start = 0, stop = 0;

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(EventRecord( start, 0 ) );
    //histogramPerBlock<<<blocks,threads>>>( pHist, x, y, w, h );
--- a/histogram/histogramPerBlockReduce.cuh
+++ b/histogram/histogramPerBlockReduce.cuh
@ -97,8 +97,8 @@ GPUhistogramPerBlockReduce(
				@@ -97,8 +97,8 @@ GPUhistogramPerBlockReduce(
    cudaEvent_t start = 0, stop = 0;
    

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(EventRecord( start, 0 ) );
    {
--- a/histogram/histogramPerGrid.cuh
+++ b/histogram/histogramPerGrid.cuh
@ -69,8 +69,8 @@ GPUhistogramPerGrid(
				@@ -69,8 +69,8 @@ GPUhistogramPerGrid(
    cudaError_t status;
    cudaEvent_t start = 0, stop = 0;

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(EventRecord( start, 0 ) );
 //    histogramPerGrid<<<blocks,threads>>>( pHist, w, h );
--- a/histogram/histogramPerThread4x32.cuh
+++ b/histogram/histogramPerThread4x32.cuh
@ -154,8 +154,8 @@ GPUhistogramPerThread4x32(
				@@ -154,8 +154,8 @@ GPUhistogramPerThread4x32(
    int numthreads = threads.x*threads.y;
    int numblocks = bPeriodicMerge ? 256 : INTDIVIDE_CEILING( w*h, numthreads*(255/4) );

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(Memset( pHist, 0, 256*sizeof(unsigned int) ) );

--- a/histogram/histogramPerThread4x64.cuh
+++ b/histogram/histogramPerThread4x64.cuh
@ -93,8 +93,8 @@ GPUhistogramPerThread4x64(
				@@ -93,8 +93,8 @@ GPUhistogramPerThread4x64(
    int numthreads = threads.x*threads.y;
    int numblocks = bPeriodicMerge ? 256 : INTDIVIDE_CEILING( w*h, numthreads*(255/4) );

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(Memset( pHist, 0, 256*sizeof(unsigned int) ) );

--- a/histogram/histogramPerThread64.cuh
+++ b/histogram/histogramPerThread64.cuh
@ -108,8 +108,8 @@ GPUhistogramPerThread64(
				@@ -108,8 +108,8 @@ GPUhistogramPerThread64(
    int numthreads = threads.x*threads.y;
    int numblocks = INTDIVIDE_CEILING( w*h, numthreads*255 );

-    cuda(EventCreate( &start, 0 ) );
-    cuda(EventCreate( &stop, 0 ) );
+    cuda(EventCreate( &start ) );
+    cuda(EventCreate( &stop ) );

    cuda(Memset( pHist, 0, 256*sizeof(unsigned int) ) );

--- a/scan/int/scan2Level.cuh
+++ b/scan/int/scan2Level.cuh
@ -168,11 +168,8 @@ scan2Level( T *out, const T *in, size_t N, int b )
				@@ -168,11 +168,8 @@ scan2Level( T *out, const T *in, size_t N, int b )

    cudaError_t status;
    T *gPartials = 0;
-    status = cudaGetSymbolAddress( 
-                (void **) &gPartials, 
-                g_globalPartials );
+    cuda(GetSymbolAddress( (void **) &gPartials, g_globalPartials ));

-    if ( cudaSuccess ==  status )
    {
        //
        // ceil(N/b) = number of partial sums to compute
@ -218,6 +215,7 @@ scan2Level( T *out, const T *in, size_t N, int b )
				@@ -218,6 +215,7 @@ scan2Level( T *out, const T *in, size_t N, int b )
            N, 
            elementsPerPartial );
    }
+Error:;
 }

 template<class T>
--- a/scan/int/testScan.cu
+++ b/scan/int/testScan.cu
@ -163,7 +163,7 @@ main( int argc, char *argv[] )
				@@ -163,7 +163,7 @@ main( int argc, char *argv[] )

    {
        cudaDeviceProp prop;
-        cudaGetDeviceProperties( &prop, 0 );
+        cuda(GetDeviceProperties( &prop, 0 ));
        maxThreads = prop.maxThreadsPerBlock;
    }