Browse Source

HIPify histogram sample

get-last-error-is-asynchronous
Nicholas Wilt 3 years ago
parent
commit
1cda62ac16
  1. 10
      chLib/chError_hip.h
  2. 7
      chLib/pgm.cu
  3. 10
      histogram/histogram.cu
  4. 4
      histogram/histogramPerBlock.cuh
  5. 4
      histogram/histogramPerBlockOffset.cuh
  6. 4
      histogram/histogramPerBlockReduce.cuh
  7. 4
      histogram/histogramPerGrid.cuh
  8. 4
      histogram/histogramPerThread4x32.cuh
  9. 4
      histogram/histogramPerThread4x64.cuh
  10. 4
      histogram/histogramPerThread64.cuh
  11. 6
      scan/int/scan2Level.cuh
  12. 2
      scan/int/testScan.cu

10
chLib/chError_hip.h

@ -70,6 +70,11 @@ template<typename T> hipError_t hipHostAlloc( T **pp, size_t N, unsigned int Fla @@ -70,6 +70,11 @@ template<typename T> hipError_t hipHostAlloc( T **pp, size_t N, unsigned int Fla
template<typename T> hipError_t hipHostGetDevicePointer( T **pp, void *p, unsigned int Flags ) {
return hipHostGetDevicePointer( (void **) pp, p, Flags );
}
template<typename T> hipError_t hipMallocPitch( T **pp, size_t *pPitch, size_t WidthInBytes, size_t Height )
{
return hipMallocPitch( (void **) pp, pPitch, WidthInBytes, Height );
}
#endif
// entry points
@ -79,17 +84,22 @@ template<typename T> hipError_t hipHostGetDevicePointer( T **pp, void *p, unsign @@ -79,17 +84,22 @@ template<typename T> hipError_t hipHostGetDevicePointer( T **pp, void *p, unsign
#define cudaHostGetDevicePointer hipHostGetDevicePointer
#define cudaStreamDestroy hipStreamDestroy
#define cudaEventDestroy hipEventDestroy
#define cudaFreeArray hipFreeArray
#define cudaGetErrorString hipGetErrorString
// data types
typedef hipStream_t cudaStream_t;
typedef hipDeviceProp_t cudaDeviceProp;
typedef hipArray cudaArray;
// defines
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaFuncCachePreferShared hipFuncCachePreferShared
#define cudaHostAllocMapped 0
#define cudaHostAllocPortable 0

7
chLib/pgm.cu

@ -37,6 +37,7 @@ @@ -37,6 +37,7 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <chError.h>
#include "pgm.h"
int
@ -55,6 +56,7 @@ pgmLoad( @@ -55,6 +56,7 @@ pgmLoad(
unsigned char *idata = NULL;
unsigned char *ddata = NULL;
size_t dPitch;
cudaError_t status;
fp = fopen( filename, "rb" );
if ( fp == NULL) {
@ -88,15 +90,14 @@ pgmLoad( @@ -88,15 +90,14 @@ pgmLoad(
if ( (size_t) w != fread( idata+row*padWidth, 1, w, fp ) )
goto Error;
}
if ( cudaSuccess != cudaMallocPitch( (void **) &ddata, &dPitch, padWidth, padHeight ) )
goto Error;
cuda(MallocPitch( (void **) &ddata, &dPitch, padWidth, padHeight ) );
*pWidth = padWidth;
*pHeight = padHeight;
*pHostPitch = padWidth;
*pHostData = idata;
*pDeviceData = ddata;
*pDevicePitch = (unsigned int) dPitch;
cudaMemcpy2D( ddata, dPitch, idata, padWidth, padWidth, padHeight, cudaMemcpyHostToDevice );
cuda(Memcpy2D( ddata, dPitch, idata, padWidth, padWidth, padHeight, cudaMemcpyHostToDevice ));
fclose(fp);
return 0;
Error:

10
histogram/histogram.cu

@ -70,7 +70,9 @@ texture<unsigned char, 2> texImage; @@ -70,7 +70,9 @@ texture<unsigned char, 2> texImage;
#include "histogramPerThread4x64.cuh"
#include "histogramPerThread4x32.cuh"
#ifndef __HIPCC__
#include "histogramNPP.cuh"
#endif
using namespace cudahandbook::threading;
@ -266,7 +268,7 @@ main(int argc, char *argv[]) @@ -266,7 +268,7 @@ main(int argc, char *argv[])
char *outputFilename = NULL;
cudaArray *pArrayImage = NULL;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
//cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
{
g_numCPUCores = processorCount();
@ -370,11 +372,12 @@ main(int argc, char *argv[]) @@ -370,11 +372,12 @@ main(int argc, char *argv[])
}
}
#ifndef __HIPCC__
cuda(MallocArray( &pArrayImage, &desc, w, h ) );
cuda(MemcpyToArray( pArrayImage, 0, 0, hidata, w*h, cudaMemcpyHostToDevice ) );
cuda(BindTextureToArray( texImage, pArrayImage ) );
#endif
{
cudaDeviceProp prop;
@ -451,8 +454,9 @@ main(int argc, char *argv[]) @@ -451,8 +454,9 @@ main(int argc, char *argv[])
TEST_VECTOR( GPUhistogramPerThread4x32, false, 1, NULL );
TEST_VECTOR( GPUhistogramPerThread4x32_PeriodicMerge, false, 1, NULL );
}
#ifndef __HIPCC__
TEST_VECTOR( GPUhistogramNPP, false, 1, NULL );
#endif
ret = 0;
Error:

4
histogram/histogramPerBlock.cuh

@ -91,8 +91,8 @@ GPUhistogramPerBlock( @@ -91,8 +91,8 @@ GPUhistogramPerBlock(
cudaError_t status;
cudaEvent_t start = 0, stop = 0;
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(EventRecord( start, 0 ) );
//histogramPerBlock<<<blocks,threads>>>( pHist, x, y, w, h );

4
histogram/histogramPerBlockOffset.cuh

@ -99,8 +99,8 @@ GPUhistogramPerBlock( @@ -99,8 +99,8 @@ GPUhistogramPerBlock(
cudaError_t status;
cudaEvent_t start = 0, stop = 0;
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(EventRecord( start, 0 ) );
//histogramPerBlock<<<blocks,threads>>>( pHist, x, y, w, h );

4
histogram/histogramPerBlockReduce.cuh

@ -97,8 +97,8 @@ GPUhistogramPerBlockReduce( @@ -97,8 +97,8 @@ GPUhistogramPerBlockReduce(
cudaEvent_t start = 0, stop = 0;
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(EventRecord( start, 0 ) );
{

4
histogram/histogramPerGrid.cuh

@ -69,8 +69,8 @@ GPUhistogramPerGrid( @@ -69,8 +69,8 @@ GPUhistogramPerGrid(
cudaError_t status;
cudaEvent_t start = 0, stop = 0;
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(EventRecord( start, 0 ) );
// histogramPerGrid<<<blocks,threads>>>( pHist, w, h );

4
histogram/histogramPerThread4x32.cuh

@ -154,8 +154,8 @@ GPUhistogramPerThread4x32( @@ -154,8 +154,8 @@ GPUhistogramPerThread4x32(
int numthreads = threads.x*threads.y;
int numblocks = bPeriodicMerge ? 256 : INTDIVIDE_CEILING( w*h, numthreads*(255/4) );
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(Memset( pHist, 0, 256*sizeof(unsigned int) ) );

4
histogram/histogramPerThread4x64.cuh

@ -93,8 +93,8 @@ GPUhistogramPerThread4x64( @@ -93,8 +93,8 @@ GPUhistogramPerThread4x64(
int numthreads = threads.x*threads.y;
int numblocks = bPeriodicMerge ? 256 : INTDIVIDE_CEILING( w*h, numthreads*(255/4) );
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(Memset( pHist, 0, 256*sizeof(unsigned int) ) );

4
histogram/histogramPerThread64.cuh

@ -108,8 +108,8 @@ GPUhistogramPerThread64( @@ -108,8 +108,8 @@ GPUhistogramPerThread64(
int numthreads = threads.x*threads.y;
int numblocks = INTDIVIDE_CEILING( w*h, numthreads*255 );
cuda(EventCreate( &start, 0 ) );
cuda(EventCreate( &stop, 0 ) );
cuda(EventCreate( &start ) );
cuda(EventCreate( &stop ) );
cuda(Memset( pHist, 0, 256*sizeof(unsigned int) ) );

6
scan/int/scan2Level.cuh

@ -168,11 +168,8 @@ scan2Level( T *out, const T *in, size_t N, int b ) @@ -168,11 +168,8 @@ scan2Level( T *out, const T *in, size_t N, int b )
cudaError_t status;
T *gPartials = 0;
status = cudaGetSymbolAddress(
(void **) &gPartials,
g_globalPartials );
cuda(GetSymbolAddress( (void **) &gPartials, g_globalPartials ));
if ( cudaSuccess == status )
{
//
// ceil(N/b) = number of partial sums to compute
@ -218,6 +215,7 @@ scan2Level( T *out, const T *in, size_t N, int b ) @@ -218,6 +215,7 @@ scan2Level( T *out, const T *in, size_t N, int b )
N,
elementsPerPartial );
}
Error:;
}
template<class T>

2
scan/int/testScan.cu

@ -163,7 +163,7 @@ main( int argc, char *argv[] ) @@ -163,7 +163,7 @@ main( int argc, char *argv[] )
{
cudaDeviceProp prop;
cudaGetDeviceProperties( &prop, 0 );
cuda(GetDeviceProperties( &prop, 0 ));
maxThreads = prop.maxThreadsPerBlock;
}

Loading…
Cancel
Save