-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnoarrmain.hpp
152 lines (125 loc) · 3.91 KB
/
noarrmain.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include <cstdlib>
#include <iostream>
#include <cstdio>
#include <chrono>
#include <cstring>
#include <sstream>
#include <noarr/structures_extended.hpp>
#include <noarr/structures/extra/traverser.hpp>
#ifdef CUDA
#include <noarr/structures/interop/cuda_traverser.cuh>
// CUCH(status): checks the result of a CUDA runtime call.
// `status` is evaluated exactly once into a local cudaError_t. If it is not
// cudaSuccess, the macro prints "<file>:<line>: error: <CUDA error string>"
// followed by the stringified call expression to std::cerr and then terminates
// the process via exit(err), i.e. the CUDA error code becomes the exit status.
// The print and the exit are joined with the comma operator so that both stay
// under the unbraced `if`; the do { ... } while (false) wrapper makes the whole
// expansion behave as a single statement.
#define CUCH(status) do { cudaError_t err = status; if (err != cudaSuccess) std::cerr << __FILE__ ":" << __LINE__ << ": error: " << cudaGetErrorString(err) << "\n\t" #status << std::endl, exit(err); } while (false)
#endif
// LOG(log): diagnostic output that can be compiled out.
// With LOGGING defined, `log` is streamed to std::cerr followed by std::endl.
// The argument is substituted without surrounding parentheses, so a chained
// expression such as `"x = " << x` can be passed as a single argument.
// Without LOGGING the macro expands to a no-op expression and `log` is never
// evaluated.
#ifdef LOGGING
#define LOG(log) \
(std::cerr << log << std::endl)
#else
#define LOG(log) ((void)0)
#endif
// Element type used for the matrices built in main and for the raw data
// pointers handed to run_matmul.
using num_t = float;
#ifdef POLY
/// Maps a compile-time index pack through `f` and collects the results.
///
/// `f` must be callable with `std::integral_constant<std::size_t, K>` and
/// return an `integral_constant`-like type exposing `value_type` and `value`.
/// `f` is only used in unevaluated context; it is never actually invoked.
///
/// @return a value-initialized `std::integer_sequence<T, V...>` where `T` is the
///         `value_type` of `f`'s result for index 0 and each `V` is the `value`
///         of `f`'s result for the corresponding entry of `Idxs`.
template<class F, std::size_t ...Idxs>
constexpr auto transform_pack(F f, std::index_sequence<Idxs...>) {
    // The element type is always taken from the mapping of index 0,
    // independently of which indices are present in the pack.
    using first_result = decltype(f(std::integral_constant<std::size_t, 0>()));
    using element_type = typename first_result::value_type;

    using mapped_sequence = std::integer_sequence<
        element_type,
        decltype(f(std::integral_constant<std::size_t, Idxs>()))::value...>;

    return mapped_sequence{};
}
/// Returns the given integer sequence with the elements at positions `I` and
/// `J` exchanged; every other position keeps its element.
///
/// When `I == J` the sequence is returned unchanged.
template<std::size_t I, std::size_t J, class C, C ...Idxs>
constexpr auto swap_pack(std::integer_sequence<C, Idxs...>) {
    constexpr std::size_t lo = std::min(I, J);
    constexpr std::size_t hi = std::max(I, J);
    // Unpack into an array so that elements can be addressed by position.
    constexpr C values[] = {Idxs...};

    // For output position Pos, choose which input position supplies the value:
    // `lo` reads from `hi`, `hi` reads from `lo`, anything else reads from itself.
    auto pick = [&]<std::size_t Pos>(std::integral_constant<std::size_t, Pos>) {
        constexpr std::size_t src = (Pos == lo) ? hi : ((Pos == hi) ? lo : Pos);
        return std::integral_constant<C, values[src]>();
    };

    return transform_pack(pick, std::make_index_sequence<sizeof...(Idxs)>());
}
#endif
// Matrix-multiplication entry point. Only declared here; the definition has to
// be supplied by the code that uses this header.
// main calls it with:
//   ta, tb, tc - the noarr structures built for matrices A, B and C,
//   pa, pb, pc - pointers to the storage of A, B and C, all located inside one
//                contiguous buffer (A first, then B, then C).
template<class A, class B, class C>
extern void run_matmul(A ta, B tb, C tc, num_t *pa, num_t *pb, num_t *pc);
int main(int argc, char **argv) {
#ifdef MATRIX_SIZE
if(argc != 2) {
std::cerr << "Usage: PROGRAM FILE" << std::endl;
std::abort();
}
auto i_st = noarr::array<'i', (std::size_t)MATRIX_SIZE>();
auto j_st = noarr::array<'j', (std::size_t)MATRIX_SIZE>();
auto k_st = noarr::array<'k', (std::size_t)MATRIX_SIZE>();
#else
if(argc != 3) {
std::cerr << "Usage: PROGRAM FILE SIZE" << std::endl;
std::abort();
}
std::size_t size;
{
std::istringstream size_stream(argv[2]);
size_stream >> size;
}
auto i_st = noarr::sized_vector<'i'>(size);
auto j_st = noarr::sized_vector<'j'>(size);
auto k_st = noarr::sized_vector<'k'>(size);
#endif
#ifdef A_ROW
auto ta = noarr::scalar<num_t>() ^ i_st ^ k_st;
#else
#ifdef A_COL
auto ta = noarr::scalar<num_t>() ^ k_st ^ i_st;
#else
#error define A_ROW or A_COL
#endif
#endif
#ifdef B_ROW
auto tb = noarr::scalar<num_t>() ^ k_st ^ j_st;
#else
#ifdef B_COL
auto tb = noarr::scalar<num_t>() ^ j_st ^ k_st;
#else
#error define B_ROW or B_COL
#endif
#endif
#ifdef C_ROW
auto tc = noarr::scalar<num_t>() ^ i_st ^ j_st;
#else
#ifdef C_COL
auto tc = noarr::scalar<num_t>() ^ j_st ^ i_st;
#else
#error define C_ROW or C_COL
#endif
#endif
std::size_t a_sz = ta | noarr::get_size();
std::size_t b_sz = tb | noarr::get_size();
std::size_t c_sz = tc | noarr::get_size();
num_t *data;
#ifdef CUDA
CUCH(cudaMallocManaged(&data, a_sz + b_sz + c_sz));
#else
if (!(data = (num_t *)malloc(a_sz + b_sz + c_sz))) {
std::cerr << __FILE__ ":" << __LINE__ << ": error: failed to allocate memory" << std::endl;
exit(1);
}
#endif
std::FILE *file = std::fopen(argv[1], "r");
if(std::fread(data, 1, a_sz + b_sz, file) != a_sz + b_sz) {
std::cerr << "Input error" << std::endl;
std::abort();
}
std::fclose(file);
// run_matmul(ta, tb, tc, data, (data + a_sz / sizeof(num_t)), (data + (a_sz + b_sz) / sizeof(num_t)));
auto start = std::chrono::high_resolution_clock::now();
run_matmul(ta, tb, tc, data, (data + a_sz / sizeof(num_t)), (data + (a_sz + b_sz) / sizeof(num_t)));
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
std::cerr << duration.count() << std::endl;
std::fwrite(data + (a_sz + b_sz) / sizeof(num_t), 1, c_sz, stdout);
#ifdef CUDA
CUCH(cudaFree(data));
#else
free(data);
#endif
return 0;
}