// Peak_Performance.c
#include <stdio.h>
#include <omp.h>
const int n_trials = 1000000000; // Enough to keep cores busy for a while and observe a steady state
const int flops_per_calc = 2; // Multiply + add = 2 floating-point operations per FMA
const int n_chained_fmas = 10; // Must be tuned for architectures here and in blocks (R) and in (E)
#define VECTOR_WIDTH 8 // Doubles per SIMD register: 4 for AVX2, 8 for AVX-512 (simdlen and array bounds need a compile-time constant)
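/* Rough sizing for n_chained_fmas above, assuming a Skylake-class core (the latency and port
   figures are assumptions, not measurements): with an FMA latency of about 4 cycles and two
   FMA ports per core, at least latency x ports = 4 x 2 = 8 independent accumulation chains
   are needed to keep the FMA units busy, so n_chained_fmas = 10 leaves a little headroom. */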
int main() {
  #pragma omp parallel
  { } // Warm up the threads
  double t0 = omp_get_wtime(); // Start timer (reset below once all threads are running)
  #pragma omp parallel
  { // Benchmark in all threads
    double fa[VECTOR_WIDTH*n_chained_fmas], fb[VECTOR_WIDTH], fc[VECTOR_WIDTH];
    for (int k = 0; k < VECTOR_WIDTH*n_chained_fmas; k++)
      fa[k] = 0.0; // prototype of a memory-based array
    for (int k = 0; k < VECTOR_WIDTH; k++) {
      fb[k] = 0.5; // fixed multiplier
      fc[k] = 1.0; // fixed addend
    }
    register double *fa01 = fa + 0*VECTOR_WIDTH; // This is block (R)
    register double *fa02 = fa + 1*VECTOR_WIDTH; // To tune for a specific architecture,
    register double *fa03 = fa + 2*VECTOR_WIDTH; // more or fewer fa* variables
    register double *fa04 = fa + 3*VECTOR_WIDTH; // must be used
    register double *fa05 = fa + 4*VECTOR_WIDTH;
    register double *fa06 = fa + 5*VECTOR_WIDTH;
    register double *fa07 = fa + 6*VECTOR_WIDTH;
    register double *fa08 = fa + 7*VECTOR_WIDTH;
    register double *fa09 = fa + 8*VECTOR_WIDTH;
    register double *fa10 = fa + 9*VECTOR_WIDTH;
    int i, j;
    #pragma omp barrier
    #pragma omp single
    { t0 = omp_get_wtime(); }
    #pragma nounroll // Prevents automatic unrolling by compiler to avoid skewed benchmarks
    for (i = 0; i < n_trials; i++) {
      #pragma omp simd simdlen(VECTOR_WIDTH) aligned(fa01,fa02,fa03,fa04,fa05,fa06,fa07,fa08,fa09,fa10,fb,fc) // Ensures that vectorization does occur
      for (j = 0; j < VECTOR_WIDTH; j++) { // VECTOR_WIDTH=4 for AVX2, =8 for AVX-512
        fa01[j] = fa01[j]*fb[j] + fc[j]; // This is block (E)
        fa02[j] = fa02[j]*fb[j] + fc[j]; // To tune for a specific architecture,
        fa03[j] = fa03[j]*fb[j] + fc[j]; // more or fewer such FMA constructs
        fa04[j] = fa04[j]*fb[j] + fc[j]; // must be used
        fa05[j] = fa05[j]*fb[j] + fc[j];
        fa06[j] = fa06[j]*fb[j] + fc[j];
        fa07[j] = fa07[j]*fb[j] + fc[j];
        fa08[j] = fa08[j]*fb[j] + fc[j];
        fa09[j] = fa09[j]*fb[j] + fc[j];
        fa10[j] = fa10[j]*fb[j] + fc[j];
      }
    }
    // Prevent dead code elimination: consume the results through a volatile sink
    volatile double sink = 0.0;
    for (j = 0; j < VECTOR_WIDTH*n_chained_fmas; j++)
      sink += fa[j];
  }
  const double t1 = omp_get_wtime();
  const double gflops = 1.0e-9*(double)VECTOR_WIDTH*(double)n_trials*(double)flops_per_calc*
                        (double)omp_get_max_threads()*(double)n_chained_fmas;
  printf("Chained FMAs=%d, vector width=%d, GFLOPs=%.1f, time=%.6f s, performance=%.1f GFLOP/s\n",
         n_chained_fmas, VECTOR_WIDTH, gflops, t1 - t0, gflops/(t1 - t0));
  return 0;
}
/*
compile:
//Preprocess
icc -std=c11 -Wall -xHost -xCORE-AVX2 -O2 -qopenmp -qopenmp-simd -qopt-report=5 -qopt-report-phase=all -qopt-report-embed -qopt-report-file=stdout -E ./Peak_Performance.c -o ./Peak_Performance.i
//Compile
icc -std=c11 -Wall -xHost -xCORE-AVX2 -O2 -qopenmp -qopenmp-simd -qopt-report=5 -qopt-report-phase=all -qopt-report-embed -qopt-report-file=stdout -S ./Peak_Performance.i -o ./Peak_Performance.s
//Assemble
icc -std=c11 -Wall -xHost -xCORE-AVX2 -O2 -qopenmp -qopenmp-simd -qopt-report=5 -qopt-report-phase=all -qopt-report-embed -qopt-report-file=stdout -c ./Peak_Performance.s -o ./Peak_Performance.o
//Link
icc -std=c11 -Wall -xHost -xCORE-AVX2 -O2 -qopenmp -qopenmp-simd -qopt-report=5 -qopt-report-phase=all -qopt-report-embed -qopt-report-file=stdout ./Peak_Performance.o -o ./Peak_Performance
//
To measure the contribution of FMA, append "-no-fma" to each command above; the resulting executable uses separate multiply and add instructions and reaches about half the GFLOP/s of the FMA build.
To measure the contribution of SIMD, replace "-qopenmp-simd" with "-qno-openmp-simd"; the resulting executable is not vectorized and reaches roughly one quarter of the GFLOP/s of the vectorized build.
For comparison with these measured numbers, a sketch for estimating the machine's theoretical peak follows after this comment block.
//
//For AVX-512 (keep VECTOR_WIDTH = 8; set it to 4 for the AVX2 builds above)
icc -std=c11 -Wall -xHost -xCORE-AVX512 -qopt-zmm-usage=high -O2 -qopenmp -qopenmp-simd -qopt-report=5 -qopt-report-phase=all -qopt-report-embed -qopt-report-file=stdout ./Peak_Performance.c
*/
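/*
For comparison with the measured GFLOP/s, the theoretical peak of a CPU is roughly
cores x AVX clock (GHz) x vector width x FMA units per core x 2 FLOP per FMA.
A minimal standalone sketch of that estimate is below; the core count, sustained AVX clock,
and FMA-unit count are placeholders and must be replaced with the values of the machine
being benchmarked.

#include <stdio.h>

int main() {
  const int    cores         = 4;   // placeholder: physical core count
  const double avx_clock_ghz = 3.0; // placeholder: sustained AVX clock in GHz
  const int    vector_width  = 8;   // doubles per register: 4 for AVX2, 8 for AVX-512
  const int    fma_units     = 2;   // placeholder: FMA pipes per core
  const int    flops_per_fma = 2;   // multiply + add
  const double peak_gflops = cores * avx_clock_ghz * vector_width * fma_units * flops_per_fma;
  printf("Theoretical peak: %.1f GFLOP/s\n", peak_gflops);
  return 0;
}
*/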
/*
origin:
https://colfaxresearch.com/skl-avx512/
#include <stdio.h>
#include <omp.h>
const int n_trials = 1000000000; // Enough to keep cores busy for a while and observe a steady state
const int flops_per_calc = 2; // Multiply + add = 2 instructions
const int n_chained_fmas = 10; // Must be tuned for architectures here and in blocks (R) and in (E)
int main() {
  #pragma omp parallel
  { } // Warm up the threads
  const double t0 = omp_get_wtime(); // start timer
  #pragma omp parallel
  { // Benchmark in all threads
    double fa[VECTOR_WIDTH*n_chained_fmas], fb[VECTOR_WIDTH], fc[VECTOR_WIDTH];
    fa[0:VECTOR_WIDTH*n_chained_fmas] = 0.0; // prototype of a memory-based array
    fb[0:VECTOR_WIDTH] = 0.5; // fixed
    fc[0:VECTOR_WIDTH] = 1.0; // fixed
    register double *fa01 = fa + 0*VECTOR_WIDTH; // This is block (R)
    register double *fa02 = fa + 1*VECTOR_WIDTH; // To tune for a specific architecture,
    register double *fa03 = fa + 2*VECTOR_WIDTH; // more or fewer fa* variables
    register double *fa04 = fa + 3*VECTOR_WIDTH; // must be used
    register double *fa05 = fa + 4*VECTOR_WIDTH;
    register double *fa06 = fa + 5*VECTOR_WIDTH;
    register double *fa07 = fa + 6*VECTOR_WIDTH;
    register double *fa08 = fa + 7*VECTOR_WIDTH;
    register double *fa09 = fa + 8*VECTOR_WIDTH;
    register double *fa10 = fa + 9*VECTOR_WIDTH;
    int i, j;
    #pragma nounroll // Prevents automatic unrolling by compiler to avoid skewed benchmarks
    for(i = 0; i < n_trials; i++)
      #pragma omp simd // Ensures that vectorization does occur
      for (j = 0; j < VECTOR_WIDTH; j++) { // VECTOR_WIDTH=4 for AVX2, =8 for AVX-512
        fa01[j] = fa01[j]*fb[j] + fc[j]; // This is block (E)
        fa02[j] = fa02[j]*fb[j] + fc[j]; // To tune for a specific architecture,
        fa03[j] = fa03[j]*fb[j] + fc[j]; // more or fewer such FMA constructs
        fa04[j] = fa04[j]*fb[j] + fc[j]; // must be used
        fa05[j] = fa05[j]*fb[j] + fc[j];
        fa06[j] = fa06[j]*fb[j] + fc[j];
        fa07[j] = fa07[j]*fb[j] + fc[j];
        fa08[j] = fa08[j]*fb[j] + fc[j];
        fa09[j] = fa09[j]*fb[j] + fc[j];
        fa10[j] = fa10[j]*fb[j] + fc[j];
      }
    fa[0:VECTOR_WIDTH*n_chained_fmas] *= 2.0; // Prevent dead code elimination
  }
  const double t1 = omp_get_wtime();
  const double gflops = 1.0e-9*(double)VECTOR_WIDTH*(double)n_trials*(double)flops_per_calc*
                        (double)omp_get_max_threads()*(double)n_chained_fmas;
  printf("Chained FMAs=%d, vector width=%d, GFLOPs=%.1f, time=%.6f s, performance=%.1f GFLOP/s\n",
         n_chained_fmas, VECTOR_WIDTH, gflops, t1 - t0, gflops/(t1 - t0));
}
*/