-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmp5.1.cu
144 lines (116 loc) · 4.63 KB
/
mp5.1.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
// MP Reduction
// Given a list (lst) of length n
// Output its sum = lst[0] + lst[1] + ... + lst[n-1];
#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while (0)
__global__ void total(float *input, float *output, int len) {
//@@ Load a segment of the input vector into shared memory
//@@ Traverse the reduction tree
//@@ Write the computed sum of the block to the output vector at the
//@@ correct index
__shared__ float Ms[BLOCK_SIZE * 2];
int tx = threadIdx.x;
int bx = blockDim.x * blockIdx.x;
// original sequence
// len = 256
int start = bx * 2 + tx * 2;
Ms[2 * tx] = start < len ? input[start] : 0;
Ms[2 * tx + 1] = (start + 1) < len ? input[start + 1] : 0;
// iterate
unsigned int stride = BLOCK_SIZE;
for (; stride >= 1; stride /= 2)
{
__syncthreads();
/*if (stride == 512 && tx == 0)
{
printf("len %d \n", len);
for (int i = 0; i < 512; ++i)
printf("%dth: %f\n", Ms[i]);
}*/
if (tx < stride)
{
Ms[tx] = Ms[tx] + Ms[tx + stride];
}
}
if (tx == 0)
{
printf("the value: %f", Ms[0]);
output[blockIdx.x] = Ms[0];
}
return;
}
int main(int argc, char **argv) {
int ii;
wbArg_t args;
float *hostInput; // The input 1D list
float *hostOutput; // The output list
float *deviceInput;
float *deviceOutput;
int numInputElements; // number of elements in the input list
int numOutputElements; // number of elements in the output list
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostInput =
(float *)wbImport(wbArg_getInputFile(args, 0), &numInputElements);
// BLOCK_SIZE * 2
numOutputElements = numInputElements / (BLOCK_SIZE << 1);
if (numInputElements % (BLOCK_SIZE << 1)) {
numOutputElements++;
}
hostOutput = (float *)malloc(numOutputElements * sizeof(float));
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The number of input elements in the input is ",
numInputElements);
wbLog(TRACE, "The number of output elements in the input is ",
numOutputElements);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
cudaMalloc((void **)&deviceInput, sizeof(float) * numInputElements);
cudaMalloc((void **)&deviceOutput, sizeof(float) * numOutputElements);
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
cudaMemcpy(deviceInput, hostInput, sizeof(float) * numInputElements, cudaMemcpyHostToDevice);
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
dim3 blockDim(BLOCK_SIZE, 1, 1);
dim3 gridDim(numOutputElements, 1, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
total<<<gridDim, blockDim>>>(deviceInput, deviceOutput, numInputElements);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float), cudaMemcpyDeviceToHost);
wbTime_stop(Copy, "Copying output memory to the CPU");
/********************************************************************
* Reduce output vector on the host
* NOTE: One could also perform the reduction of the output vector
* recursively and support any size input. For simplicity, we do not
* require that for this lab.
********************************************************************/
printf("value: %f for main \n", hostOutput[0]);
for (ii = 1; ii < numOutputElements; ii++) {
printf("result: %dth %f\n", ii, hostOutput[ii]);
hostOutput[0] += hostOutput[ii];
}
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
cudaFree(deviceOutput);
cudaFree(deviceInput);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostOutput, 1);
free(hostInput);
free(hostOutput);
return 0;
}