|
7 | 7 | #include <stdlib.h>
|
8 | 8 | #include <inttypes.h>
|
9 | 9 | #include <math.h>
|
10 |
| - |
11 | 10 | #if defined(HAVE_SME)
|
12 | 11 |
|
13 | 12 | /* Function prototypes */
|
@@ -44,15 +43,31 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
|
44 | 43 | m_mod = ceil((double)M/(double)vl_elms) * vl_elms;
|
45 | 44 |
|
46 | 45 | float *A_mod = (float *) malloc(m_mod*K*sizeof(float));
|
47 |
| - |
| 46 | + |
| 47 | + /* Compiler barrier: the empty asm marks every predicate (p0-p15) and |
| 48 | + * vector (z0-z31) register as clobbered, so no value is kept live in them |
| 49 | + * across this point; such values are re-read from memory instead. */ |
| 50 | + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", |
| 51 | + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", |
| 52 | + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", |
| 53 | + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", |
| 54 | + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", |
| 55 | + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); |
| 56 | + |
48 | 57 | /* Pre-process the left matrix to make it suitable for
|
49 | 58 | matrix sum of outer-product calculation
|
50 | 59 | */
|
51 | 60 | sgemm_direct_sme1_preprocess(M, K, A, A_mod);
|
52 | 61 |
|
53 | 62 | /* Calculate C = A*B */
|
54 | 63 | sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);
|
55 |
| - |
| 64 | + |
| 65 | + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", |
| 66 | + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", |
| 67 | + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", |
| 68 | + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", |
| 69 | + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", |
| 70 | + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); |
56 | 71 | free(A_mod);
|
57 | 72 | }
|
58 | 73 |
|
|
0 commit comments