/* * * win32KernelStreamCallback.cu * * Microdemo to examine the behavior of stream callbacks on Win32. * * Periodically counts the number of CPU threads extant, and reports * that number. * * Build with: nvcc -I ../chLib win32KernelStreamCallback.cu * Requires: No minimum SM requirement. * * Copyright (c) 2011-2012, Archaea Software, LLC. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */ #include #include #include #include #include __global__ void NullKernel() { } __global__ void DereferenceNullKernel() { *((volatile int *) 0) = 0xdeadbeef; } class CStreamCallbacksStats { private: CRITICAL_SECTION m_cs; int m_cCallbacks; int m_intervalCount; int m_totalCallbacks; DWORD m_dwThreadId; HANDLE m_hEvent; // event to signal int m_signalCount; // number of callbacks to look for before signaling event int m_maxThreadCount; static const int intervalPeriod = 1000; public: CStreamCallbacksStats() { InitializeCriticalSection( &m_cs ); m_hEvent = NULL; m_signalCount = 0; m_intervalCount = intervalPeriod; m_cCallbacks = 0; m_totalCallbacks = 0; m_maxThreadCount = 0; m_dwThreadId = 0; } ~CStreamCallbacksStats() { DeleteCriticalSection( &m_cs ); if ( m_hEvent ) { CloseHandle( m_hEvent ); } } bool Initialize( int numCallbacksBeforeSignal ) { m_signalCount = numCallbacksBeforeSignal; m_hEvent = CreateEvent( NULL, FALSE, FALSE, NULL ); return m_hEvent != NULL; } int getTotalCallbacks() const { return m_totalCallbacks; } static void CUDART_CB countCallbacks( cudaStream_t stream, cudaError_t status, void *userData ); BOOL Wait() { return WaitForSingleObject( m_hEvent, INFINITE ); } int CountProcessThreads( ); } ; int CStreamCallbacksStats::CountProcessThreads( ) { DWORD dwOwnerPID = GetCurrentProcessId(); int ret = 0; int numThreads = 0; HANDLE hThreadSnap = INVALID_HANDLE_VALUE; THREADENTRY32 te32; // Take a snapshot of all running threads hThreadSnap = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); if( hThreadSnap == INVALID_HANDLE_VALUE ) return( FALSE ); // Fill in the size of the structure before using it. te32.dwSize = sizeof(THREADENTRY32 ); // Retrieve information about the first thread, // and exit if unsuccessful if( !Thread32First( hThreadSnap, &te32 ) ) goto Error; numThreads = 0; do { if( te32.th32OwnerProcessID == dwOwnerPID ) { numThreads += 1; } } while( Thread32Next(hThreadSnap, &te32 ) ); EnterCriticalSection( &m_cs ); if ( numThreads > m_maxThreadCount ) { m_maxThreadCount = numThreads; } LeaveCriticalSection( &m_cs ); ret = numThreads; Error: CloseHandle( hThreadSnap ); return ret; } void CUDART_CB CStreamCallbacksStats::countCallbacks( cudaStream_t stream, cudaError_t status, void *userData ) { CStreamCallbacksStats *p = (CStreamCallbacksStats *) userData; const int intervalPeriod = 1000; EnterCriticalSection( &p->m_cs ); DWORD dwThreadId = GetCurrentThreadId(); if ( p->m_dwThreadId == 0 ) { printf( "Initializing thread ID\n" ); p->m_dwThreadId = dwThreadId; } else { if ( p->m_dwThreadId != dwThreadId ) { printf( "Different thread ID\n" ); } } LeaveCriticalSection( &p->m_cs ); if ( 0 == InterlockedDecrement( (LPLONG) &p->m_intervalCount ) ) { InterlockedExchange( (LPLONG) &p->m_intervalCount, intervalPeriod ); p->CountProcessThreads( ); } if ( cudaSuccess != status ) { // confirm that kernel that faulted is reported properly printf( "status = %d\n", status ); return; } if ( p->m_signalCount == InterlockedIncrement( (LONG *) &p->m_totalCallbacks ) ) { printf( "Signaling event\n" ); SetEvent( p->m_hEvent ); } } int main( int argc, char *argv[] ) { cudaError_t status; const int cIterations = 1000; CStreamCallbacksStats stats; stats.Initialize( cIterations ); cuInit(0);// Sleep(1000); printf( "Max threads after cuInit(0): %d\n", stats.CountProcessThreads() ); cudaFree(0); printf( "Max threads after cudaFree(0): %d\n", stats.CountProcessThreads() ); cuda(StreamAddCallback( NULL, CStreamCallbacksStats::countCallbacks, &stats, cudaStreamCallbackNonblocking ) ); printf( "Max threads after cudaStreamAddCallback(): %d\n", stats.CountProcessThreads() ); printf( "Measuring asynchronous launch time (with nonblocking callbacks)... " ); fflush( stdout ); chTimerTimestamp start, stop; chTimerGetTime( &start ); for ( int i = 0; i < cIterations; i++ ) { NullKernel<<<1,1>>>(); cuda(StreamAddCallback( NULL, CStreamCallbacksStats::countCallbacks, &stats, cudaStreamCallbackNonblocking ) ); if ( i == 0 ) { printf( "Max threads: %d\n", stats.CountProcessThreads() ); } } DereferenceNullKernel<<<1,1>>>(); cuda(StreamAddCallback( NULL, CStreamCallbacksStats::countCallbacks, &stats, cudaStreamCallbackNonblocking ) ); cudaDeviceSynchronize(); chTimerGetTime( &stop ); // race condition unless we wait here stats.Wait(); printf( "%d callbacks\n", stats.getTotalCallbacks() ); { double microseconds = 1e6*chTimerElapsedTime( &start, &stop ); double usPerLaunch = microseconds / (float) cIterations; printf( "%.2f us\n", usPerLaunch ); } return 0; Error: printf( "CUDA error: %d (%s)\n", status, cudaGetErrorString( status ) ); return 1; }