// matmult.cu
// For ECE-GY 9143 - High Performance Computing for Machine Learning
// Instructor: Prof. Parijat Dubey
// Based on code from the CUDA Programming Guide
// DO NOT MODIFY FOR THE ASSIGNMENT

// Includes
#include <stdio.h>
#include <stdlib.h>   // malloc, free, exit
#include <math.h>     // fabs
#include "timer.h"
#include "matmultKernel.h"

// Defines
#define epsilon (float)1e-3
#define verbose 0
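
// This file assumes the following from its two local headers. This is a
// reference sketch only; the authoritative declarations live in
// matmultKernel.h and timer.h, and the macro values shown are assumptions.
//
//   // matmultKernel.h (assumed interface)
//   typedef struct {
//     int width;
//     int height;
//     int stride;       // row stride, in elements
//     float* elements;
//   } Matrix;
//   #define BLOCK_SIZE 16                // threads per block edge; value assumed
//   #define FOOTPRINT_SIZE BLOCK_SIZE    // output tile edge per block; value assumed
//   __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C);
//
//   // timer.h (assumed interface): wall-clock timing helpers
//   void initialize_timer(void);
//   void start_timer(void);
//   void stop_timer(void);
//   double elapsed_time(void);           // seconds between start and stop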
// Create a new matrix in device memory.
Matrix MakeDeviceMatrix(Matrix M, bool copy){
  Matrix newDeviceMatrix;
  newDeviceMatrix.width = M.width;
  newDeviceMatrix.stride = M.width;
  newDeviceMatrix.height = M.height;
  size_t size = M.width * M.height * sizeof(float);
  cudaMalloc((void**) &newDeviceMatrix.elements, size);
  if (copy)
    cudaMemcpy(newDeviceMatrix.elements, M.elements, size, cudaMemcpyHostToDevice);
  return newDeviceMatrix;
}
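
// cudaMalloc and cudaMemcpy return a cudaError_t that the code above ignores.
// For debugging one's own changes, a minimal checking wrapper could look like
// the sketch below (illustrative only; not part of the original assignment code):
//
//   #define CUDA_CHECK(call)                                            \
//     do {                                                              \
//       cudaError_t err_ = (call);                                      \
//       if (err_ != cudaSuccess) {                                      \
//         fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
//                 cudaGetErrorString(err_), __FILE__, __LINE__);        \
//         exit(1);                                                      \
//       }                                                               \
//     } while (0)
//
//   CUDA_CHECK(cudaMalloc((void**)&newDeviceMatrix.elements, size));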

// Host code for matrix multiplication.
// Matrix dimensions must be multiples of `dimension`
// (the per-block output footprint). This code assumes
// the matrices are square.
void MatMul(const Matrix A, const Matrix B, Matrix C, int dimension){

  // Create device data structures.
  Matrix device_A = MakeDeviceMatrix(A, true);
  Matrix device_B = MakeDeviceMatrix(B, true);
  Matrix device_C = MakeDeviceMatrix(C, false);

  // Define grid topology: one block per dimension x dimension output tile.
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(B.width/dimension, A.height/dimension);

  // Invoke kernel for warm up
  MatMulKernel<<<dimGrid, dimBlock>>>(device_A, device_B, device_C);
  // Synchronize to make sure everyone is done in the warmup.
  cudaDeviceSynchronize();

  // Set up timer
  initialize_timer();
  start_timer();

  // Invoke kernel for real
  MatMulKernel<<<dimGrid, dimBlock>>>(device_A, device_B, device_C);
  // Synchronize to make sure everyone is done.
  cudaDeviceSynchronize();

  // Compute and report the timing results.
  stop_timer();
  double time = elapsed_time();
  // 2 flops (one multiply, one add) per inner-product term.
  double nFlops = (double)A.width*A.height*B.width*2;
  double nFlopsPerSec = nFlops/time;
  double nGFlopsPerSec = nFlopsPerSec*1e-9;
  printf("Data dimensions: %dx%d\n", C.height, C.width);
  printf("Grid Dimensions: %dx%d\n", dimGrid.x, dimGrid.y);
  printf("Block Dimensions: %dx%d\n", dimBlock.x, dimBlock.y);
  printf("Footprint Dimensions: %dx%d\n", FOOTPRINT_SIZE, FOOTPRINT_SIZE);
  printf("Time: %lf (sec), nFlops: %0.0lf, GFlopsS: %lf\n",
         time, nFlops, nGFlopsPerSec);

  // Copy the result from device memory to host memory.
  size_t size = C.width * C.height * sizeof(float);
  cudaMemcpy(C.elements, device_C.elements, size, cudaMemcpyDeviceToHost);

  // Free device memory.
  cudaFree(device_A.elements);
  cudaFree(device_B.elements);
  cudaFree(device_C.elements);
}
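
// The launch in MatMul gives each thread block a FOOTPRINT_SIZE x FOOTPRINT_SIZE
// tile of C, computed by BLOCK_SIZE x BLOCK_SIZE threads. In the simplest case,
// assuming FOOTPRINT_SIZE == BLOCK_SIZE (one output element per thread), a
// compatible naive kernel would look like the sketch below. This is not the
// course's matmultKernel.cu, just an illustration of the expected contract.
// No bounds checks are needed because the dimensions are exact multiples of
// the footprint.
//
//   __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){
//     int row = blockIdx.y * blockDim.y + threadIdx.y;
//     int col = blockIdx.x * blockDim.x + threadIdx.x;
//     float sum = 0.0f;
//     for (int k = 0; k < A.width; ++k)
//       sum += A.elements[row * A.stride + k] * B.elements[k * B.stride + col];
//     C.elements[row * C.stride + col] = sum;
//   }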

// Create a matrix in host memory.
// Note: stride is left unset; host-side code indexes by width.
Matrix MakeHostMatrix(int width, int height){
  Matrix newHostMatrix;
  newHostMatrix.width = width;
  newHostMatrix.height = height;
  size_t size = newHostMatrix.width * newHostMatrix.height * sizeof(float);
  newHostMatrix.elements = (float*)malloc(size);
  return newHostMatrix;
}

// Print a matrix stored in host memory.
void printMatrix(Matrix M, const char* name) {
  printf("\n%s \n", name);
  for(int y=0; y<M.height; y++){
    for(int x=0; x<M.width; x++) {
      printf("%f ", M.elements[y * M.width + x]);
    }
    printf("\n");
  }
}

// Initialize dummy data in a matrix stored in host memory:
// each element gets its column index (horizontal) or row index (vertical).
void initMatrix(Matrix M, bool horizontal) {
  for(int y=0; y<M.height; y++) {
    for(int x=0; x<M.width; x++) {
      M.elements[y*M.width+x] = (float)(horizontal ? x : y);
    }
  }
}
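
// With A initialized vertically (A[i][k] = i) and B horizontally (B[k][j] = j),
// the product is
//   C[i][j] = sum_{k=0}^{width-1} A[i][k]*B[k][j] = sum_k i*j = width*i*j,
// which is the closed form checkResult compares against below.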

// Check the specified matrix to be sure it is correct.
// That is, make sure it is the result of multiplying the
// dummy data we created earlier.
void checkResult(Matrix M) {
  // Build the expected result: C[y][x] = width*x*y.
  Matrix correct = MakeHostMatrix(M.width, M.height);
  for(int y=0; y<M.height; y++) {
    for(int x=0; x<M.width; x++) {
      correct.elements[y*correct.width+x] = (float)M.width*(float)x*y;
    }
  }

  if(verbose){
    printMatrix(correct, "correct");
    printMatrix(M, "result");
  }

  // Count entries whose relative error exceeds epsilon.
  double maxerror = 0.0;
  int errCnt = 0;
  for(int y=0; y<correct.height; y++) {
    for(int x=0; x<correct.width; x++) {
      float it = correct.elements[y*correct.width+x];
      if(fabs(it - M.elements[y*M.width+x]) > epsilon*it) {
        errCnt++;
        double error = fabs(it - M.elements[y*M.width+x])/it;
        if (error > maxerror) maxerror = error;
      }
    }
  }

  if(errCnt > 0){
    printf("\n\nTEST FAILED: number of errors: %d, max rel error: %f\n", errCnt, maxerror);
  }
  free(correct.elements);
}

//
// main
//
int main(int argc, char** argv) {

  // Grid dimension (number of blocks along each axis).
  int num_blocks;
  // Matrices will be square, of size data_size x data_size,
  // where data_size is a multiple of FOOTPRINT_SIZE.
  int data_size;

  // Read command line argument.
  if(argc == 2){
    sscanf(argv[1], "%d", &num_blocks);
    data_size = num_blocks * FOOTPRINT_SIZE;
  } else {
    printf("Usage: %s NumBlocks\n", argv[0]);
    exit(0);
  }

  // Create matrices in host memory.
  Matrix host_A = MakeHostMatrix(data_size, data_size);
  Matrix host_B = MakeHostMatrix(data_size, data_size);
  Matrix host_C = MakeHostMatrix(data_size, data_size);

  // Initialize values in host A and B.
  initMatrix(host_A, false);
  initMatrix(host_B, true);

  // Debugging output.
  if(verbose){
    printMatrix(host_A, "host_A");
    printMatrix(host_B, "host_B");
  }

  // Perform CUDA matrix multiplication:
  // MatMul is a host function that calls the device
  // kernel MatMulKernel and times its performance.
  MatMul(host_A, host_B, host_C, FOOTPRINT_SIZE);

  // Verify that the result is correct.
  checkResult(host_C);

  // Free allocated memory.
  free(host_A.elements);
  free(host_B.elements);
  free(host_C.elements);

  return 0;
}
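
// Example build and run (a sketch; the actual file names and any course
// Makefile may differ):
//
//   nvcc -O3 -o matmult matmult.cu matmultKernel.cu timer.cu
//   ./matmult 16    # multiplies two square matrices of edge 16*FOOTPRINT_SIZE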