-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmp7.cu
183 lines (156 loc) · 7.25 KB
/
mp7.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#include <wb.h>
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while (0)
#define BLOCK_SIZE (256)
/*
NNZ: Number of None-Zero elements
maxRowNNZ: Max NNZ for a row (NNZ for the most dense row)
ndata: Total NNZ
dim: The size of vec and width of the original matrix. The original matrix is square
float matCols[ndata]: Column coordinate in the original matrix. Indexed column-major
float matData[ndata]: Data value in the original matrix. Indexed column-major
int matColStart[maxRowNNZ]: The index of the first element in matCols and matData for each column,
int matRowPerm[dim]: Original row number in CSR for every row in JDS
float vec[dim]: the dense multiplier vector
int matRows[dim]: NNZ/size for every row
*/
// appropriate elements of the JDS data array, the JDS col index array, JDS row index array,
// and the JDS transposed col ptr array to generate one Y element.
__global__ void spmvJDSKernel(float *out, int *matColStart, int *matCols,
int *matRowPerm, int *matRows,
float *matData, float *vec, int dim) {
//@@ insert spmv kernel for jds format
int tx = threadIdx.x;
int bx = blockIdx.x;
int index = bx * BLOCK_SIZE + tx;
int maxNNZ = * matRows; // the first one
int current_row_size = 0;
float sum = 0;
if (index < dim) current_row_size = matRows[index];
for (int i = 0; i < current_row_size; ++i)
{
// cumulate the multiplication
int idx = matColStart[i] + index; // idx in the matData matrix
int col = matCols[idx]; // original
// matCols and matData shares the idx, column based == colstart + original row number
// the index can be applied here since the size of row is sorted in descendantly.
// idx is the allocation of the point in the JDS data layout(column majored)
// lookup the original column number and the value by using the index
sum += vec[col] * matData[idx];
//this lib is intended for coalesc
}
__syncthreads();
if (index < dim) out[matRowPerm[index]] = sum;
return;
}
// spmvJDS(deviceOutput, deviceJDSColStart, deviceJDSCols, deviceJDSRowPerm, deviceJDSRows,
// deviceJDSData, deviceVector, dim);
// same arguments type and name as the other signature
static void spmvJDS(float *out, int *matColStart, int *matCols,
int *matRowPerm, int *matRows, float *matData,
float *vec, int dim) {
//@@ invoke spmv kernel for jds format
dim3 blockDim(BLOCK_SIZE, 1,1);
dim3 gridDim(ceil( (double)dim / BLOCK_SIZE) ); //
spmvJDSKernel<<<gridDim, blockDim>>>(out, matColStart, matCols, matRowPerm, matRows, matData, vec, dim);
return;
}
int main(int argc, char **argv) {
wbArg_t args;
int *hostCSRCols;
int *hostCSRRows;
float *hostCSRData;
int *hostJDSColStart;
int *hostJDSCols;
int *hostJDSRowPerm;
int *hostJDSRows;
float *hostJDSData;
float *hostVector;
float *hostOutput;
int *deviceJDSColStart; // row pointer**
int *deviceJDSCols; // col_index marker
int *deviceJDSRowPerm; // permutation indices
int *deviceJDSRows; // ? ?
float *deviceJDSData; // data
float *deviceVector; // vector for multiple
float *deviceOutput;
int dim, ncols, nrows, ndata;
int maxRowNNZ;
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostCSRCols = (int *)wbImport(wbArg_getInputFile(args, 0), &ncols, "Integer");
hostCSRRows = (int *)wbImport(wbArg_getInputFile(args, 1), &nrows, "Integer");
hostCSRData = (float *)wbImport(wbArg_getInputFile(args, 2), &ndata, "Real");
hostVector = (float *)wbImport(wbArg_getInputFile(args, 3), &dim, "Real");
// output for Y
hostOutput = (float *)malloc(sizeof(float) * dim);
wbTime_stop(Generic, "Importing data and creating memory on host");
/* conversion to JDS format */
CSRToJDS(dim, hostCSRRows, hostCSRCols, hostCSRData, &hostJDSRowPerm, &hostJDSRows,
&hostJDSColStart, &hostJDSCols, &hostJDSData);
maxRowNNZ = hostJDSRows[0]; // Number of None-Zero elements? so the col_start == 0, 13, the first two element make sense, and rest of them are garbages in the mem
// test the converted value
printf("dim: %d, maxRowNNZ: %d \n", dim, maxRowNNZ); // configuration params
printf("\ncol printer for CSR, size %d\n", ncols);
/*
for (int i = 0; i < ncols; ++i)
{
printf("%dth - col: %d \t row: %d \t permrow: %d \t col_start: %d \t data: %f\n", i, * (hostJDSCols + i), *(hostJDSRows + i), *(hostJDSRowPerm + i), *(hostJDSColStart + 1), *(hostJDSData + i));
}
*/
wbTime_start(GPU, "Allocating GPU memory.");
cudaMalloc((void **)&deviceJDSColStart, sizeof(int) * maxRowNNZ);
cudaMalloc((void **)&deviceJDSCols, sizeof(int) * ndata);
cudaMalloc((void **)&deviceJDSRowPerm, sizeof(int) * dim);
cudaMalloc((void **)&deviceJDSRows, sizeof(int) * dim);
cudaMalloc((void **)&deviceJDSData, sizeof(float) * ndata);
cudaMalloc((void **)&deviceVector, sizeof(float) * dim);
cudaMalloc((void **)&deviceOutput, sizeof(float) * dim);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
cudaMemcpy(deviceJDSColStart, hostJDSColStart, sizeof(int) * maxRowNNZ,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceJDSCols, hostJDSCols, sizeof(int) * ndata, cudaMemcpyHostToDevice);
cudaMemcpy(deviceJDSRowPerm, hostJDSRowPerm, sizeof(int) * dim, cudaMemcpyHostToDevice);
cudaMemcpy(deviceJDSRows, hostJDSRows, sizeof(int) * dim, cudaMemcpyHostToDevice);
cudaMemcpy(deviceJDSData, hostJDSData, sizeof(float) * ndata, cudaMemcpyHostToDevice);
cudaMemcpy(deviceVector, hostVector, sizeof(float) * dim, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
wbTime_start(Compute, "Performing CUDA computation");
// sparse matrix multiple vector function call
spmvJDS(deviceOutput, deviceJDSColStart, deviceJDSCols, deviceJDSRowPerm, deviceJDSRows,
deviceJDSData, deviceVector, dim);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * dim, cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
cudaFree(deviceVector);
cudaFree(deviceOutput);
cudaFree(deviceJDSColStart);
cudaFree(deviceJDSCols);
cudaFree(deviceJDSRowPerm);
cudaFree(deviceJDSRows);
cudaFree(deviceJDSData);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostOutput, dim);
free(hostCSRCols);
free(hostCSRRows);
free(hostCSRData);
free(hostVector);
free(hostOutput);
free(hostJDSColStart);
free(hostJDSCols);
free(hostJDSRowPerm);
free(hostJDSRows);
free(hostJDSData);
return 0;
}