You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
130 lines
4.8 KiB
130 lines
4.8 KiB
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions |
|
* are met: |
|
* * Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* * Neither the name of NVIDIA CORPORATION nor the names of its |
|
* contributors may be used to endorse or promote products derived |
|
* from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY |
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
*/ |
|
|
|
/** |
|
* Matrix multiplication: C = A * B. |
|
* Device code. |
|
* |
|
* This sample implements matrix multiplication as described in Chapter 3 |
|
* of the programming guide. |
|
* It has been written for clarity of exposition to illustrate various CUDA |
|
* programming principles, not with the goal of providing the most |
|
* performant generic kernel for matrix multiplication. |
|
* |
|
* See also: |
|
* V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra," |
|
* in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08), |
|
* Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11. |
|
*/ |
|
|
|
/** |
|
* Matrix multiplication (CUDA Kernel) on the device: C = A * B |
|
* wA is A's width and wB is B's width |
|
*/ |
|
|
|
#include <cooperative_groups.h> |
|
|
|
/**
 * Tiled matrix multiplication on the device: C = A * B.
 *
 * Each thread block produces one BLOCK_SIZE x BLOCK_SIZE tile of C, and each
 * thread produces one element of that tile. Tiles of A and B are staged
 * through shared memory so that every global element is loaded once per
 * block instead of once per thread.
 *
 * All matrices are addressed row-major: A with row stride wA, B and C with
 * row stride wB. B is read as having wA rows.
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE): threadIdx.x / threadIdx.y
 *     index the shared tiles directly and every tile slot must be filled.
 *   - The height of A and C is not a parameter, so rows cannot be
 *     bounds-checked here. Every row addressed by the grid, i.e. rows
 *     0 .. gridDim.y * BLOCK_SIZE - 1, must exist in both A and C.
 *   - wA and wB do not have to be multiples of BLOCK_SIZE: partial tiles are
 *     zero-padded and threads whose column is >= wB do not store.
 *   - Uses 2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory.
 *
 * @tparam BLOCK_SIZE  Tile edge length, equal to the thread-block edge length.
 * @param  C           Output matrix (row stride wB).
 * @param  A           Left operand (row stride wA); only read.
 * @param  B           Right operand (row stride wB, wA rows); only read.
 * @param  wA          Width of A, which is also the length of the dot product.
 * @param  wB          Width of B and of C.
 */
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Handle to thread block group
    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();

    // Thread index within the block
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Global row and column of the C element owned by this thread. These are
    // kept in 64-bit signed integers so that the row * width offsets below
    // cannot wrap a 32-bit int when a matrix holds 2^31 or more elements.
    const long long row = static_cast<long long>(blockIdx.y) * BLOCK_SIZE + ty;
    const long long col = static_cast<long long>(blockIdx.x) * BLOCK_SIZE + tx;

    // Threads past the right edge of B / C still take part in the loads of A
    // and in every barrier, but never touch B or C.
    const bool colInRange = col < wB;

    // Shared-memory tiles holding the current sub-matrices of A and B
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Csub accumulates the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;

    // Walk the shared dimension one tile at a time
    for (long long kBase = 0; kBase < wA; kBase += BLOCK_SIZE) {
        const long long kA = kBase + tx;  // column of A loaded by this thread
        const long long kB = kBase + ty;  // row of B loaded by this thread

        // Load one element of each tile. Slots that fall outside the
        // matrices are filled with zero; for a padded k index both As and Bs
        // hold zero, so the padding contributes exactly 0 to the sum.
        As[ty][tx] = (kA < wA) ? A[row * wA + kA] : 0.0f;
        Bs[ty][tx] = (kB < wA && colInRange) ? B[kB * wB + col] : 0.0f;

        // Synchronize to make sure the tiles are fully loaded
        cooperative_groups::sync(cta);

        // Multiply the two tiles together;
        // each thread accumulates one element of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding computation is done
        // before the tiles are overwritten in the next iteration
        cooperative_groups::sync(cta);
    }

    // Write the result; each in-range thread writes one element
    if (colInRange) {
        C[row * wB + col] = Csub;
    }
}
|
|
|
/**
 * Kernel entry point that instantiates matrixMulCUDA with BLOCK_SIZE = 16.
 *
 * Declared extern "C" so the kernel keeps its unmangled name. All arguments
 * are forwarded unchanged: C is the output matrix, A and B are the operands,
 * wA is A's width and wB is B's width.
 *
 * NOTE(review): presumably launched with 16 x 16 thread blocks, since the
 * device routine indexes its 16 x 16 shared tiles with threadIdx - confirm
 * against the host launch code.
 */
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
{
    constexpr int kTileDim = 16;
    matrixMulCUDA<kTileDim>(C, A, B, wA, wB);
}
|
|
|
/**
 * Kernel entry point that instantiates matrixMulCUDA with BLOCK_SIZE = 32.
 *
 * Declared extern "C" so the kernel keeps its unmangled name. All arguments
 * are forwarded unchanged: C is the output matrix, A and B are the operands,
 * wA is A's width and wB is B's width.
 *
 * NOTE(review): presumably launched with 32 x 32 thread blocks, since the
 * device routine indexes its 32 x 32 shared tiles with threadIdx - confirm
 * against the host launch code.
 */
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
{
    constexpr int kTileDim = 32;
    matrixMulCUDA<kTileDim>(C, A, B, wA, wB);
}
|
|
|