#include <cstdio>
#include <cstdlib>
#include <ctime>

// Fill an N x N matrix (flat, row-major) with uniform random values in [0, 1].
// Note: seeding with time(NULL) on every call means repeated calls within the
// same second produce the same matrix.
void random_initial_matrix(float *mat, int N)
{
    srand(time(NULL));
    for(int i = 0; i < N * N; i++)
        mat[i] = (float)rand() / RAND_MAX;
}
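
// Host-side usage sketch (the matrix size and buffer name below are
// illustrative, not defined in this file):
//
//   const int N = 1024;
//   float *h_A = (float*)malloc(N * N * sizeof(float));
//   random_initial_matrix(h_A, N);
//   ...
//   free(h_A);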

// Wrap a kernel launch so that (1) any CUDA error already pending before the
// launch is reported, and (2) launch errors are reported immediately after it.
// cudaPeekAtLastError() is used so the error state is inspected without being
// reset.
#define CUDA_KERNEL_CALLER(...) do{\
    if(cudaPeekAtLastError() != cudaSuccess){\
        printf("A CUDA error occurred prior to the kernel call %s at line %d\n", #__VA_ARGS__, __LINE__);\
        exit(1);\
    }\
    __VA_ARGS__;\
    cudaError_t cuda_ret = cudaPeekAtLastError();\
    if(cuda_ret != cudaSuccess){\
        printf("CUDA Error at line %d in file %s\n", __LINE__, __FILE__);\
        printf("    Error message: %s\n", cudaGetErrorString(cuda_ret));\
        printf("    In the kernel call %s\n", #__VA_ARGS__);\
        exit(1);\
    }\
}while(0)
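
// Usage sketch (the kernel `my_kernel`, its arguments, and the launch geometry
// are placeholders, not defined in this file):
//
//   CUDA_KERNEL_CALLER(my_kernel<<<grid, block>>>(d_data, n));
//
// Because the macro is variadic, the commas inside <<<...>>> and the argument
// list pass through unchanged, and #__VA_ARGS__ reproduces the full call text
// in the error message. cudaPeekAtLastError() catches launch-time failures
// only; errors raised while the kernel executes surface later, e.g. after
// cudaDeviceSynchronize().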

/*
// Tiled SGEMM-style kernel: C = alpha * A * B + beta * C for N x N row-major
// matrices. Each 16x16 thread block computes one 16x16 tile of C, staging
// tiles of A and B in shared memory. Assumes blockDim = (16, 16) and that N
// is a multiple of the tile size.
__global__ void matrixmul(float alpha, float *A, float *B, float beta, float *C, int N)
{
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    const int tile_size = 16;
    __shared__ float As[tile_size][tile_size];
    __shared__ float Bs[tile_size][tile_size];

    int abegin = by * tile_size * N;   // first element of A's row of tiles
    int aend   = abegin + N - 1;       // last element of that row
    int bbegin = tile_size * bx;       // first element of B's column of tiles
    int bstep  = tile_size * N;        // stride from one B tile down to the next

    float Csub = 0.0f;
    for(int a = abegin, b = bbegin; a <= aend; a += tile_size, b += bstep)
    {
        // Load one tile of A and one tile of B into shared memory.
        As[ty][tx] = A[a + N * ty + tx];
        Bs[ty][tx] = B[b + N * ty + tx];
        __syncthreads();

        // Accumulate the partial dot product for this thread's output element.
        for(int k = 0; k < tile_size; k++)
            Csub += As[ty][k] * Bs[k][tx];
        __syncthreads();
    }

    int id = abegin + bbegin + N * ty + tx;
    C[id] = Csub * alpha + C[id] * beta;
}
*/
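
// End-to-end driver sketch for the kernel above (names and sizes are
// assumptions for illustration; error checks on cudaMalloc/cudaMemcpy omitted):
//
//   float *d_A, *d_B, *d_C;
//   size_t bytes = (size_t)N * N * sizeof(float);
//   cudaMalloc(&d_A, bytes);  cudaMalloc(&d_B, bytes);  cudaMalloc(&d_C, bytes);
//   cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
//   cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);
//   cudaMemcpy(d_C, h_C, bytes, cudaMemcpyHostToDevice);
//   dim3 block(16, 16), grid(N / 16, N / 16);
//   CUDA_KERNEL_CALLER(matrixmul<<<grid, block>>>(alpha, d_A, d_B, beta, d_C, N));
//   cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);
//   cudaFree(d_A);  cudaFree(d_B);  cudaFree(d_C);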