-
Notifications
You must be signed in to change notification settings - Fork 0
/
bitonic.cu
107 lines (82 loc) · 3.29 KB
/
bitonic.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include "types.h"
#include "print.h"
#include "helpers.h"
__global__ void bitonic_step(elem* d_arr, size_t size, int k, int j)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x + gridDim.x;
int tmp;
int ij;
int v;
// i is the index of our first element
for (int i = tid; i < size; i += stride) {
// ij is the index of the second element, the one we compare the first with
ij = i ^ j;
// v keeps track of the order we want to sort the i'th chunk of size k,
// if it's 0 it's ascending else it's descending
v = i & k;
// printf("\t\ti = %2d ij = %2d v = %2d\n", i, ij, v); printf("\t\tshould we swap %d with %d?\n", d_arr[i], d_arr[ij]);
// if the element we compare to is after our element ?
if (ij > i) {
// if the elements are not in the correct order that we want
if ((v == 0 && d_arr[i] > d_arr[ij]) || (v != 0 && (d_arr[i] < d_arr[ij]))) {
// swap the elements at index i and ij
tmp = d_arr[i];
d_arr[i] = d_arr[ij];
d_arr[ij] = tmp;
// debug("\t\t\tyes! because v = %d\n", v == 0);
}
}
}
}
struct Result bitonic_sort(elem *unsorted, size_t size, int threads, int blocks)
{
elem *sorted = NULL;
elem *d_sorted = NULL;
elem *d_unsorted = NULL;
// _true will also record the time to copy the data back and forth
float time;
cudaEvent_t start, stop;
cudaErr(cudaEventCreate(&start));
cudaErr(cudaEventCreate(&stop));
printf("Allocating memory for sorted..\n");
sorted = (elem *)malloc(size * sizeof(elem));
if (sorted == NULL) {/*{{{*/
printf("malloc failed at line: %d in file %s\n", __LINE__, __FILE__);
exit(-1);
}/*}}}*/
printf("Allocating memory for d_unsorted..\n");
cudaErr(cudaMalloc((void **)&d_unsorted, size * sizeof(elem)));
printf("Allocating memory for d_sorted..\n");
cudaErr(cudaMalloc((void **)&d_sorted, size * sizeof(elem)));
cudaErr(cudaEventRecord(start));
printf("Sorting..\n");
// copy unsorted to device
cudaErr(cudaMemcpy(d_unsorted, unsorted, size * sizeof(elem), cudaMemcpyHostToDevice));
for (int k = 2; k <= (int)size; k *= 2) { // k is doubled every iteration
debug("k = %d\n", k);
for (int j = k/2; j > 0; j /= 2) { // j is halved at every iteration, with truncation of fractional parts
debug("\tj = %d\n", j);
bitonic_step<<<blocks, threads>>>(d_unsorted, size, k, j);
// #ifdef DEBUG
// cudaErr(cudaMemcpy(sorted, d_unsorted, size * sizeof(elem), cudaMemcpyDeviceToHost));
// print_array(sorted, size, "step");
// #endif
}
}
cudaLastErr();
cudaErr(cudaMemcpy(sorted, d_unsorted, size * sizeof(elem), cudaMemcpyDeviceToHost));
cudaErr(cudaEventRecord(stop));
cudaErr(cudaEventSynchronize(stop));
cudaErr(cudaEventElapsedTime(&time, start, stop));
/* free device memory */
printf("Freeing device memory..\n");
cudaErr(cudaFree((void*)d_unsorted));
cudaErr(cudaFree((void*)d_sorted));
debug("DONE\n");
Result res = { .sorted = sorted, .time = time };
return res;
}