// Source code that accompanies The CUDA Handbook.
/*
*
* peer2peerTestNUMA.cu
*
* Explore NUMA properties of PCI Express bus hierarchy.
*
* Build with: nvcc -I ../chLib <options> peer2peerTestNUMA.cu
* Requires: No minimum SM requirement.
*
* Copyright (c) 2016, Archaea Software, LLC.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <semaphore.h>
#include <assert.h>
#include <vector>
#include <list>
#include <iostream>
#include <fstream>
#include "chError.h"
#include "chTimer.h"
#include "chNUMA.h"
#define MAX_DEVICES 32
using namespace std;
void *g_hostBuffers[MAX_DEVICES];
// Indexed as follows: [device][event]
bool g_bEnabled[MAX_DEVICES][MAX_DEVICES];
// these are already defined on some platforms - make our
// own definitions that will work.
#undef min
#undef max
#define min(a,b) ((a)<(b)?(a):(b))
#define max(a,b) ((b)<(a)?(a):(b))
#define NUM_ARRAY_ELEMENTS(a) (sizeof(a)/sizeof((a)[0]))
#define CHECK_NONZERO(f) { int ret = (f); if ( 0 != ret) { fprintf(stderr, "%s returned %d (File %s at line %d)\n", #f, ret, __FILE__, __LINE__ ); exit(1); } }
// Grab this mutex before writing output, so it does not get garbled
pthread_mutex_t g_mutexOutput;
// Resources needed for each device
const size_t g_cBytes = 2560ULL*1048576;
size_t g_cIterations = 10;
class cEnumCPUGPU {
public:
// make GPU by default
cEnumCPUGPU( int i, bool bCPU ) { m_i = i; m_bCPU=bCPU; }
friend cEnumCPUGPU makeGPU( int iGPU ) { return cEnumCPUGPU( iGPU, false ); }
friend cEnumCPUGPU makeCPU( int iCPU ) { return cEnumCPUGPU( iCPU, true ); }
bool bGPU() const { return !m_bCPU; }
bool bCPU() const { return m_bCPU; }
int getCPU() const {
assert( m_bCPU );
return m_i;
}
int getGPU() const {
assert( !m_bCPU );
return m_i;
}
friend ostream& operator<<( ostream&, const cEnumCPUGPU& );
private:
bool m_bCPU;
int m_i;
};
inline ostream&
operator<<( ostream& os, const cEnumCPUGPU& e )
{
if ( e.m_bCPU ) {
os << "cpu " << e.m_i;
}
else {
os << "gpu " << e.m_i;
}
return os;
}
//
// Thread context structure
//
class CGPULoadDriver {
public:
virtual ~CGPULoadDriver();
bool TimeMemcpys();
virtual bool PerformMemcpys() = 0;
sem_t m_semWait;
sem_t m_semDone;
static void *ThreadProc( void * );
protected:
CGPULoadDriver( cEnumCPUGPU dstDevice, cEnumCPUGPU srcDevice, cEnumCPUGPU eventDevice, size_t m_cBytes, bool bLatencyTest = false );
size_t m_cBytes;
cudaEvent_t m_evStart;
cudaEvent_t m_evStop;
cEnumCPUGPU m_dstDevice;
cEnumCPUGPU m_srcDevice;
cEnumCPUGPU m_eventDevice;
bool m_bUseEvents;
bool m_bLatencyTest;
};
class CGPUTestP2P : public CGPULoadDriver {
public:
CGPUTestP2P( cEnumCPUGPU dst, cEnumCPUGPU src, size_t cBytes, bool bUseEvents, bool bLatencyTest = false );
virtual ~CGPUTestP2P();
virtual bool PerformMemcpys( );
void *m_dptrDst;
void *m_dptrSrc;
};
class CGPUTestP2PLatency : public CGPUTestP2P {
public:
CGPUTestP2PLatency( cEnumCPUGPU dst, cEnumCPUGPU src, size_t cIterations, bool bUseEvents ) : CGPUTestP2P( dst, src, 4, bUseEvents, true ) {
cudaError_t status;
m_cIterations = cIterations;
cuda(Malloc( &m_dptrDstTimestamps, (cIterations+1)*sizeof(uint64_t) ) );
cuda(Malloc( &m_dptrSrcTimestamps, (cIterations+1)*sizeof(uint64_t) ) );
return;
Error:
fprintf(stderr, "cudaMalloc failed\n" );
exit(1);
}
virtual bool PerformMemcpys( );
private:
size_t m_cIterations;
uint64_t *m_dptrDstTimestamps, *m_dptrSrcTimestamps;
};
class CGPUTestH2D : public CGPULoadDriver {
public:
CGPUTestH2D( cEnumCPUGPU dst, cEnumCPUGPU src, size_t cBytes, bool bUseEvents );
~CGPUTestH2D();
virtual bool PerformMemcpys( );
void *m_dptrDst;
void *m_pSrc;
};
class CGPUTestD2H : public CGPULoadDriver {
public:
CGPUTestD2H( cEnumCPUGPU dst, cEnumCPUGPU src, size_t cBytes, bool bUseEvents );
~CGPUTestD2H();
virtual bool PerformMemcpys( );
void *m_dptrSrc;
void *m_pDst;
};
struct GPUPair {
GPUPair( cEnumCPUGPU dst, cEnumCPUGPU src, bool bLatencyTest ): iDst(dst), iSrc(src), m_bLatencyTest(bLatencyTest) { }
cEnumCPUGPU iDst, iSrc;
bool m_bLatencyTest;
};
ostream&
operator<<( ostream& os, const GPUPair& p )
{
os << p.iDst << " <- " << p.iSrc;
return os;
}
CGPULoadDriver::CGPULoadDriver( cEnumCPUGPU dstDevice, cEnumCPUGPU srcDevice, cEnumCPUGPU eventDevice, size_t cBytes, bool bLatencyTest ):
m_dstDevice( dstDevice ), m_srcDevice( srcDevice ), m_eventDevice( eventDevice ), m_bLatencyTest(bLatencyTest)
{
cudaError_t status;
m_evStart = m_evStop = 0;
m_cBytes = cBytes;
CHECK_NONZERO( sem_init( &m_semWait, 0, 1 ) );
CHECK_NONZERO( sem_init( &m_semDone, 0, 1 ) );
cuda(SetDevice( m_eventDevice.getGPU() ) );
cuda(EventCreate( &m_evStart ) );
cuda(EventCreate( &m_evStop ) );
return;
Error:
cout << "Error creating CGPULoadDriver( " << dstDevice << ", " << srcDevice << endl;
exit(1);
}
CGPULoadDriver *
makeLoadDriver( cEnumCPUGPU dst, cEnumCPUGPU src, size_t bytes, bool bUseEvents , bool bLatencyTest)
{
// CPU<->CPU is not valid
assert( src.bGPU() || dst.bGPU() );
if ( bLatencyTest ) {
assert( src.bGPU() && dst.bGPU() );
return new CGPUTestP2PLatency( dst, src, 100000, bUseEvents );
}
if ( dst.bGPU() && src.bGPU() ) {
return new CGPUTestP2P( dst, src, bytes, bUseEvents );
} else if ( dst.bGPU() && src.bCPU() ) {
return new CGPUTestH2D( dst, src, bytes, bUseEvents );
} else if ( dst.bCPU() && src.bGPU() ) {
return new CGPUTestD2H( dst, src, bytes, bUseEvents );
}
return NULL;
}
CGPUTestP2P::CGPUTestP2P( cEnumCPUGPU dstDevice, cEnumCPUGPU srcDevice, size_t cBytes, bool bUseEvents, bool bLatencyTest ):
CGPULoadDriver( dstDevice, srcDevice, srcDevice, cBytes, bLatencyTest )
{
cudaError_t status;
assert( dstDevice.bGPU() && srcDevice.bGPU() );
m_bUseEvents = bUseEvents;
cuda(SetDevice(m_dstDevice.getGPU() ) );
cuda(Malloc( &m_dptrDst, g_cBytes ) );
cuda(Memset( m_dptrDst, 0, g_cBytes ) );
cuda(SetDevice(m_srcDevice.getGPU() ) );
cuda(Malloc( &m_dptrSrc, g_cBytes ) );
cuda(Memset( m_dptrSrc, 0, g_cBytes ) );
return;
Error:
cerr << "Error creating CGPUTestP2P " << dstDevice << ", " << srcDevice << endl;
exit(1);
}
CGPUTestH2D::CGPUTestH2D( cEnumCPUGPU dstDevice, cEnumCPUGPU srcDevice, size_t cBytes, bool bUseEvents ):
CGPULoadDriver( dstDevice, srcDevice, dstDevice, cBytes )
{
cudaError_t status;
assert( dstDevice.bGPU() && srcDevice.bCPU() );
m_bUseEvents = bUseEvents;
m_dptrDst = 0;
m_pSrc = 0;
cuda(SetDevice(m_dstDevice.getGPU() ) );
cuda(Malloc( &m_dptrDst, m_cBytes ) );
if ( ! chNUMApageAlignedAllocHost( &m_pSrc, m_cBytes, dstDevice.getGPU() ) )
goto Error;
return;
Error:
cerr << "Error creating CGPUTestH2D " << dstDevice << ", " << srcDevice << endl;
exit(1);
}
CGPUTestD2H::CGPUTestD2H( cEnumCPUGPU dstDevice, cEnumCPUGPU srcDevice, size_t cBytes, bool bUseEvents ):
CGPULoadDriver( dstDevice, srcDevice, srcDevice, cBytes )
{
cudaError_t status;
assert( dstDevice.bCPU() && srcDevice.bGPU() );
m_bUseEvents = bUseEvents;
m_pDst = 0;
m_dptrSrc = 0;
cuda(SetDevice(m_srcDevice.getGPU() ) );
cuda(Malloc( &m_dptrSrc, g_cBytes ) );
cuda(MallocHost( &m_pDst, g_cBytes ) );
return;
Error:
cerr << "Error creating CGPUTestD2H " << dstDevice << ", " << srcDevice << endl;
exit(1);
}
CGPULoadDriver::~CGPULoadDriver()
{
sem_destroy( &m_semWait );
sem_destroy( &m_semDone );
cudaSetDevice( m_eventDevice.getGPU() );
cudaEventDestroy( m_evStart );
cudaEventDestroy( m_evStop );
}
CGPUTestP2P::~CGPUTestP2P()
{
if ( m_dptrDst ) {
cudaSetDevice( m_dstDevice.getGPU() );
cudaFree( m_dptrDst );
}
if ( m_dptrSrc ) {
cudaSetDevice( m_srcDevice.getGPU() );
cudaFree( m_dptrSrc );
}
}
CGPUTestH2D::~CGPUTestH2D()
{
if ( m_dptrDst ) {
cudaSetDevice( m_dstDevice.getGPU() );
cudaFree( m_dptrDst );
}
chNUMApageAlignedFreeHost( m_pSrc );
}
CGPUTestD2H::~CGPUTestD2H()
{
if ( m_dptrSrc ) {
cudaSetDevice( m_srcDevice.getGPU() );
cudaFree( m_dptrSrc );
}
cudaFreeHost( m_pDst );
}
bool
CGPUTestP2P::PerformMemcpys( )
{
bool bRet = false;
cudaError_t status;
for ( int j = 0; j < g_cIterations; j++ ) {
cuda(MemcpyPeerAsync( m_dptrDst, m_dstDevice.getGPU(),
m_dptrSrc, m_srcDevice.getGPU(),
g_cBytes, NULL ) );
}
bRet = true;
Error:
return bRet;
}
__global__ void
p2pPingPongLatencyTest(
void *_pLocal,
void *_pRemote,
uint64_t *pTimestamps,
int bWait,
int cIterations )
{
volatile int *pLocal = (volatile int *) _pLocal;
volatile int *pRemote = (volatile int *) _pRemote;
int pingpongValue = 0;
while ( cIterations-- ) {
*pTimestamps++ = clock64();
if ( bWait )
while ( *pLocal != pingpongValue );
bWait = 1;
pingpongValue = 1-pingpongValue;
*pRemote = pingpongValue;
}
}
void
computeStatistics( int32_t *pmin, int32_t *pmax, double *pmean, double *pstdev, uint64_t *pClocks, size_t N )
{
int32_t min = INT_MAX;
int32_t max = INT_MIN;
int64_t sum = 0, sumsq = 0;
size_t cSamples = 0;
for ( size_t i = 10; i < N-1; i++ ) {
int32_t diff = (int32_t) (pClocks[i+1] - pClocks[i]);
sum += diff;
sumsq += (uint64_t) diff*diff;
if ( diff < min ) min = diff;
if ( diff > max ) max = diff;
cSamples += 1;
}
*pmin = min;
*pmax = max;
*pmean = sum / (double) cSamples;
int64_t numerator = (int64_t) cSamples*sumsq - sum*sum;
*pstdev = sqrt(numerator / ((double) cSamples*(cSamples-1)));
}
bool
CGPUTestP2PLatency::PerformMemcpys( )
{
bool bRet = false;
cudaError_t status;
cudaDeviceProp prop;
double srcClockRate, dstClockRate;
uint64_t *phostDst = (uint64_t *) malloc( (m_cIterations+1)*sizeof(uint64_t) );
uint64_t *phostSrc = (uint64_t *) malloc( (m_cIterations+1)*sizeof(uint64_t) );
cuda(SetDevice( m_srcDevice.getGPU() ) );
cuda(GetDeviceProperties( &prop, m_srcDevice.getGPU() ) );
srcClockRate = (double) prop.clockRate;
p2pPingPongLatencyTest<<<1,1>>>( m_dptrSrc, m_dptrDst, m_dptrSrcTimestamps, 1, m_cIterations );
cuda(SetDevice( m_dstDevice.getGPU() ) );
cuda(GetDeviceProperties( &prop, m_dstDevice.getGPU() ) );
dstClockRate = (double) prop.clockRate;
p2pPingPongLatencyTest<<<1,1>>>( m_dptrDst, m_dptrSrc, m_dptrDstTimestamps, 0, m_cIterations );
cuda(DeviceSynchronize() );
cuda(Memcpy( phostDst, m_dptrDstTimestamps, (m_cIterations+1)*sizeof(uint64_t) , cudaMemcpyDeviceToHost ) );
cuda(SetDevice( m_srcDevice.getGPU() ) );
cuda(Memcpy( phostSrc, m_dptrSrc, sizeof(uint64_t) , cudaMemcpyDeviceToHost ) );
cuda(Memcpy( phostSrc, m_dptrDstTimestamps, (m_cIterations+1)*sizeof(uint64_t), cudaMemcpyDeviceToHost ) );
printf( "\nClocks statistics (dst):\n" );
{
int32_t minClocks, maxClocks;
double meanClocks, stdevClocks;
double clockRate = dstClockRate/1e6;
#if 0
for ( int i = 0; i < m_cIterations-1; i++ ) {
int clocks = phostDst[i+1]-phostDst[i];
printf( " %llX\t%d (%d Hz) = %.0f us\n", phostDst[i], clocks, srcClockRate, (double) clocks*1e6 / (double) dstClockRate );
}
#endif
computeStatistics( &minClocks, &maxClocks, &meanClocks, &stdevClocks, phostDst, m_cIterations );
printf( " min: %d clocks (%.0f ns)\n", minClocks, minClocks/clockRate );
printf( " max: %d clocks (%.0f ns)\n", maxClocks, maxClocks/clockRate );
printf( " mean: %.2f clocks (%.0f ns)\n", meanClocks, meanClocks/clockRate );
printf( " stdev: %.2f clocks (%.0f ns)\n", stdevClocks, stdevClocks/clockRate );
}
printf( "Clocks statistics (src):\n" );
{
int32_t minClocks, maxClocks;
double meanClocks, stdevClocks;
double clockRate = srcClockRate/1e6;
#if 0
for ( int i = 0; i < m_cIterations-1; i++ ) {
int clocks = phostSrc[i+1]-phostSrc[i];
printf( " %llX\t%d (%d Hz) = %.0f us\n", phostSrc[i], clocks, dstClockRate, (double) clocks*1e6 / (double) srcClockRate );
}
#endif
computeStatistics( &minClocks, &maxClocks, &meanClocks, &stdevClocks, phostSrc, m_cIterations );
printf( " min: %d clocks (%.0f ns)\n", minClocks, minClocks/clockRate );
printf( " max: %d clocks (%.0f ns)\n", maxClocks, maxClocks/clockRate );
printf( " mean: %.2f clocks (%.0f ns)\n", meanClocks, meanClocks/clockRate );
printf( " stdev: %.2f clocks (%.0f ns)\n", stdevClocks, stdevClocks/clockRate );
}
bRet = true;
Error:
return bRet;
}
bool
CGPUTestH2D::PerformMemcpys( )
{
bool bRet = false;
cudaError_t status;
cuda(SetDevice( m_dstDevice.getGPU() ) );
for ( int j = 0; j < g_cIterations; j++ ) {
cuda(MemcpyAsync( m_dptrDst, m_pSrc, g_cBytes, cudaMemcpyHostToDevice ) );
}
bRet = true;
Error:
return bRet;
}
bool
CGPUTestD2H::PerformMemcpys( )
{
bool bRet = false;
cudaError_t status;
cuda(SetDevice( m_srcDevice.getGPU() ) );
for ( int j = 0; j < g_cIterations; j++ ) {
cuda(MemcpyAsync( m_pDst, m_dptrSrc, g_cBytes, cudaMemcpyDeviceToHost ) );
}
bRet = true;
Error:
return bRet;
}
bool
CGPULoadDriver::TimeMemcpys( )
{
cudaError_t status;
bool bRet = false;
bool bAcquiredMutex = false;
CHECK_NONZERO( sem_wait( &m_semWait ) );
cuda(SetDevice( m_eventDevice.getGPU() ) );
if ( m_bUseEvents ) {
cuda(EventRecord( m_evStart, NULL ) );
}
if ( ! PerformMemcpys() )
goto Error;
if ( m_bUseEvents ) {
cuda(SetDevice( m_eventDevice.getGPU() ) );
cuda(EventRecord( m_evStop, NULL ) );
}
cuda(DeviceSynchronize() );
if ( m_bUseEvents ) {
cuda(SetDevice( m_eventDevice.getGPU() ) );
cuda(EventRecord( m_evStop, NULL ) );
}
cuda(DeviceSynchronize() );
CHECK_NONZERO( pthread_mutex_lock( &g_mutexOutput ) );
bAcquiredMutex = true;
if ( m_bUseEvents && (! m_bLatencyTest) )
{
float ms;
cuda(EventElapsedTime( &ms, m_evStart, m_evStop ) );
double MBytes = g_cIterations*g_cBytes / 1048576.0;
double MBpers = 1000.0*MBytes / ms;
cout << " " << m_dstDevice << " <- " << m_srcDevice << ": " << MBpers << " MB/s (CUDA event)" << endl;
}
bRet = true;
Error:
CHECK_NONZERO( sem_post( &m_semDone ) );
if ( bAcquiredMutex ) {
CHECK_NONZERO( pthread_mutex_unlock( &g_mutexOutput ) );
}
return bRet;
}
void *
CGPULoadDriver::ThreadProc( void *pContext )
{
CGPULoadDriver *p = (CGPULoadDriver *) pContext;
CHECK_NONZERO( sem_wait( &p->m_semWait ) );
p->TimeMemcpys( );
CHECK_NONZERO( sem_post( &p->m_semDone ) );
return NULL;
}
bool
LaunchMemcpys_threaded( vector<GPUPair> pairs, size_t cBytes, bool bUseEvents )
{
int cPairs = pairs.size();
chTimerTimestamp start, stop;
pthread_t *threads = new pthread_t[cPairs];
vector< CGPULoadDriver * > tests( cPairs );
for ( int i = 0; i < cPairs; i++ ) {
// CPU to CPU transfers not supported
if ( pairs[i].iSrc.bCPU() && pairs[i].iDst.bCPU() ) {
return false;
}
tests[i] = makeLoadDriver( pairs[i].iDst, pairs[i].iSrc, cBytes, bUseEvents, pairs[i].m_bLatencyTest );
}
for ( int i = 0; i < cPairs; i++ ) {
CHECK_NONZERO( pthread_create( &threads[i], NULL, CGPULoadDriver::ThreadProc, (void *) tests[i] ) );
}
sleep(1); // let the threads get a chance to hit the semaphore wait
chTimerGetTime( &start );
for ( int i = 0; i < cPairs; i++ ) {
CHECK_NONZERO( sem_post( &tests[i]->m_semWait ) );
}
for ( int i = 0; i < cPairs; i++ ) {
CHECK_NONZERO( sem_wait( &tests[i]->m_semDone ) );
}
for ( int i = 0; i < cPairs; i++ ) {
pthread_join( threads[i], NULL );
}
chTimerGetTime( &stop );
{
int cActivePairs = 0;
for ( int i = 0; i < cPairs; i++ ) {
cActivePairs += ! pairs[i].m_bLatencyTest;
}
double TotalMBytes = (double) cActivePairs * g_cBytes * g_cIterations / 1e6;
if ( cActivePairs != 0 ) {
double ElapsedTime = chTimerElapsedTime( &start, &stop );
printf( " Wall clock total observed bandwidth%s: %.0f MB/s\n", bUseEvents?"":" (no events)", TotalMBytes/ElapsedTime );
}
}
for ( int i = 0; i < cPairs; i++ ) {
delete tests[i];
}
return true;
}
bool
RunTest( vector<GPUPair> pairs, size_t cBytes, const char *szTestName )
{
cout << szTestName << " test:" << endl;;
for ( int i = 0; i < pairs.size(); i++ ) {
cout << " " << pairs[i] << endl;
}
if ( ! LaunchMemcpys_threaded( pairs, cBytes, true ) ) goto Error;
// the bool says whether to use events.
// if ( ! LaunchMemcpys_threaded( pairs, cBytes, false ) ) goto Error;
return true;
Error:
return false;
}
vector<GPUPair>
ReadConfigFile( const char *s )
{
vector<GPUPair> v;
ifstream cfgfile(s);
if ( ! cfgfile ) {
fprintf( stderr, "Could not open %s\n", s );
exit(1);
}
while ( cfgfile ) {
bool bSrcCPU, bDstCPU;
int srcDevice, dstDevice;
string deviceType;
bool bLatencyTest = false;
cfgfile >> deviceType;
if ( deviceType=="latency" ) {
bLatencyTest = true;
cfgfile >> deviceType;
}
cfgfile >> dstDevice;
if ( cfgfile.eof() )
break;
bDstCPU = false;
if ( deviceType=="cpu" ) {
if ( bLatencyTest ) {
fprintf( stderr, "Latency test must be between GPUs\n" );
exit(1);
}
bDstCPU = true;
}
else if ( deviceType != "gpu") {
fprintf( stderr, "Parsing error (not cpu or gpu)\n" );
exit(1);
}
{
string check;
cfgfile >> check;
if ( check != "<-" ) {
fprintf( stderr, "Parsing error (missing <- symbol)\n" );
exit(1);
}
}
cfgfile >> deviceType >> srcDevice;
bSrcCPU = false;
if ( deviceType=="cpu" ) {
if ( bLatencyTest ) {
fprintf( stderr, "Latency test must be between GPUs\n" );
exit(1);
}
bSrcCPU = true;
}
else if ( deviceType != "gpu") {
fprintf( stderr, "Parsing error (not cpu or gpu)\n" );
exit(1);
}
v.push_back( GPUPair( cEnumCPUGPU(dstDevice, bDstCPU), cEnumCPUGPU( srcDevice, bSrcCPU), bLatencyTest ) );
}
return v;
}
int
main( int argc, char *argv[] )
{
int deviceCount;
cudaError_t status;
printf( "Peer-to-peer memcpy... " ); fflush( stdout );
cuda(GetDeviceCount( &deviceCount ) );
if ( deviceCount <= 1 ) {
printf( "Peer-to-peer demo requires at least 2 devices\n" );
exit(1);
}
printf( "%d devices detected\n", deviceCount );
pthread_mutex_init( &g_mutexOutput, NULL );
for ( int i = 0; i < deviceCount; i++ ) {
cudaSetDevice( i );
for ( int j = 0; j < deviceCount; j++ ) {
if ( i != j ) {
int bEnabled;
cuda(DeviceCanAccessPeer( &bEnabled, i, j ) );
g_bEnabled[i][j] = (0 != bEnabled);
if ( bEnabled ) {
cuda(DeviceEnablePeerAccess( j, 0 ) );
}
}
}
}
if ( 2 != argc ) {
fprintf( stderr, "Usage: %s <configfile>\n", argv[0] );
exit(1);
}
{
vector<GPUPair> v = ReadConfigFile( argv[1] );
RunTest( v, g_cBytes, argv[1] );
}
return 0;
Error:
printf( "Error\n" );
return 1;
}