// matmultKernel00.cu
// For ECE-GY 9143 - High Performance Computing for Machine Learning
// Instructor: Parijat Dubey
// Based on code from the CUDA Programming Guide
// Multiplies two matrices using CUDA: A x B = C
// Copy this file and modify the MatMulKernel device function for
// any of your experiments.
#include "matmultKernel.h"
#define FOOTPRINT_SIZE BLOCK_SIZE
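
// For reference: matmultKernel.h is not shown here. Based on how the kernel
// uses it, the header is assumed to define BLOCK_SIZE and a row-major matrix
// descriptor roughly like the following (a sketch of the assumed interface,
// not the actual header):
//
//   typedef struct {
//     int width;       // number of columns
//     int height;      // number of rows
//     int stride;      // row stride in elements
//     float* elements; // pointer to the matrix data
//   } Matrix;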
// Define a GPU kernel to perform matrix multiplication
// of A x B = C.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){
  // matrix blocks
  float *Asub, *Bsub, *Csub;
  // Putting these into registers speeds access.
  int thread_row = threadIdx.y;
  int thread_col = threadIdx.x;
  int block_row = blockIdx.y;
  int block_col = blockIdx.x;

  // Each THREAD BLOCK computes one submatrix Csub of C.
  // EACH THREAD creates its own matrix descriptor Csub.
  Csub = &C.elements[C.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * block_col];
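
  // Worked example of the tile arithmetic (BLOCK_SIZE = 16 is an assumed
  // value, taken to come from matmultKernel.h): block
  // (block_row, block_col) = (2, 3) gets Csub = &C.elements[C.stride*32 + 48],
  // the top-left corner of its 16 x 16 output tile; within that tile, thread
  // (r, c) owns element Csub[r * C.stride + c].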

  // Each thread computes one element of Csub in its copy of Cvalue.
  float Cvalue = 0;

  // Loop over all submatrices in block_row of A and block_col of B
  // required to compute Csub. Block multiply each pair of submatrices
  // and accumulate the results.
  for (int m = 0; m < (A.width / BLOCK_SIZE); ++m){
    // Get Asub and Bsub descriptors.
    Asub = &A.elements[A.stride * BLOCK_SIZE * block_row + BLOCK_SIZE * m];
    Bsub = &B.elements[B.stride * BLOCK_SIZE * m + BLOCK_SIZE * block_col];

    // Copy ELEMENTS OF Asub and Bsub into shared memory.
    // EACH THREAD loads ONE ELEMENT of Asub and ONE of Bsub.
    // Notice: the element a thread loads need not be the one it
    // uses to compute its Cvalue, as long as all elements are
    // collaboratively read.

    // Notice: every thread declares shared_A and shared_B in shared memory
    // even though a thread block has only one shared_A and one shared_B.
    __shared__ float shared_A[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float shared_B[BLOCK_SIZE][BLOCK_SIZE];
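
    // Sizing note (again assuming BLOCK_SIZE = 16): each tile is
    // 16 * 16 * sizeof(float) = 1 KB, so the block uses 2 KB of shared
    // memory in total, comfortably under the per-block shared memory
    // limit of typical devices (48 KB or more).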

    // Each thread copies just one element of shared_A and one of shared_B.
    shared_A[thread_row][thread_col] = Asub[thread_row * A.stride + thread_col];
    shared_B[thread_row][thread_col] = Bsub[thread_row * B.stride + thread_col];

    // Synchronize to ensure all elements are read.
    __syncthreads();

    // Do an inner product of one row of shared_A and one column of shared_B,
    // computing one Cvalue by accumulation.
    #pragma unroll
    for(int e = 0; e < BLOCK_SIZE; ++e)
      Cvalue += shared_A[thread_row][e] * shared_B[e][thread_col];

    // Synchronize to ensure all threads are done reading the current
    // shared_A and shared_B tiles before they are overwritten in the
    // next iteration.
    __syncthreads();
  }

  // Write Csub to GLOBAL memory.
  // Each thread writes its own cell value.
  Csub[thread_row * C.stride + thread_col] = Cvalue;
}
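
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original kernel): one plausible host-side
// launch, assuming matrix dimensions that are a multiple of FOOTPRINT_SIZE
// and Matrix descriptors dA, dB, dC whose .elements already point to device
// memory. The names dA, dB, dC are illustrative, not from this file.
//
//   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
//   // One thread block per FOOTPRINT_SIZE x FOOTPRINT_SIZE tile of C.
//   dim3 dimGrid(dB.width / FOOTPRINT_SIZE, dA.height / FOOTPRINT_SIZE);
//   MatMulKernel<<<dimGrid, dimBlock>>>(dA, dB, dC);
//   cudaDeviceSynchronize();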