@@ -110,9 +110,9 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
110
110
#if defined(__GNUG__) && !defined(__clang__)
111
111
112
112
// inline asm helper
113
- // Defines fnm(const rg&) returning rt: issues the instruction opc through inline asm,
- // with the result as operand %x0 and the argument as operand %x1. rto and rgo are
- // stringified into the output and input operand constraints respectively.
- #define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
114
- VSX_FINLINE(rt) fnm(const rg& a) \
115
- { rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }
113
+ // Defines fnm(const rg&) returning rt: issues the instruction opc through inline asm,
+ // with the result as operand %x0 and the argument as operand %x1. Both operands use
+ // the "wa" register constraint, so callers no longer pass constraint letters.
+ #define VSX_IMPL_1RG(rt, rg, opc, fnm) \
114
+ VSX_FINLINE(rt) fnm(const rg& a) \
115
+ { rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }
116
116
117
117
#define VSX_IMPL_1VRG (rt, rg, opc, fnm ) \
118
118
VSX_FINLINE (rt) fnm(const rg& a) \
@@ -233,6 +233,10 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
233
233
#if __GNUG__ < 5
234
234
// vec_xxpermdi in GCC 4 lacks little-endian support, just like clang
235
235
// Swaps the two vector operands and remaps the 2-bit selector c (0->3, 1->1, 2->2, 3->0)
// before forwarding to vec_xxpermdi.
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
236
+ // vec_vbpermq needs the same workaround as vec_xxpermdi
237
+ # undef vec_vbpermq
238
+ VSX_IMPL_2VRG (vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
239
+ VSX_IMPL_2VRG(vec_dword2, vec_char16, vbpermq, vec_vbpermq)
236
240
#else
237
241
# define vec_permi vec_xxpermdi
238
242
#endif // __GNUG__ < 5
@@ -257,44 +261,38 @@ VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
257
261
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
258
262
259
263
// converts word and doubleword to double-precision
260
- #ifdef vec_ctd
261
- # undef vec_ctd
262
- #endif
263
- VSX_IMPL_1RG (vec_double2, wd, vec_int4, wa, xvcvsxwdp, vec_ctdo)
264
- VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, vec_ctdo)
265
- VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, vec_ctd)
266
- VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)
264
+ #undef vec_ctd
265
+ VSX_IMPL_1RG (vec_double2, vec_int4, xvcvsxwdp, vec_ctdo)
266
+ VSX_IMPL_1RG(vec_double2, vec_uint4, xvcvuxwdp, vec_ctdo)
267
+ VSX_IMPL_1RG(vec_double2, vec_dword2, xvcvsxddp, vec_ctd)
268
+ VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)
267
269
268
270
// converts word and doubleword to single-precision
269
271
#undef vec_ctf
270
- VSX_IMPL_1RG (vec_float4, wf, vec_int4, wa, xvcvsxwsp, vec_ctf)
271
- VSX_IMPL_1RG(vec_float4, wf, vec_uint4, wa, xvcvuxwsp, vec_ctf)
272
- VSX_IMPL_1RG(vec_float4, wf, vec_dword2, wi, xvcvsxdsp, vec_ctfo)
273
- VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi , xvcvuxdsp, vec_ctfo)
272
+ VSX_IMPL_1RG (vec_float4, vec_int4, xvcvsxwsp, vec_ctf)
273
+ VSX_IMPL_1RG(vec_float4, vec_uint4, xvcvuxwsp, vec_ctf)
274
+ VSX_IMPL_1RG(vec_float4, vec_dword2, xvcvsxdsp, vec_ctfo)
275
+ VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)
274
276
275
277
// converts single and double precision to signed word
276
278
#undef vec_cts
277
- VSX_IMPL_1RG (vec_int4, wa, vec_double2, wd , xvcvdpsxws, vec_ctso)
278
- VSX_IMPL_1RG(vec_int4, wa, vec_float4, wf, xvcvspsxws, vec_cts)
279
+ VSX_IMPL_1RG (vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
280
+ VSX_IMPL_1RG(vec_int4, vec_float4, xvcvspsxws, vec_cts)
279
281
280
282
// converts single and double precision to unsigned word
281
283
#undef vec_ctu
282
- VSX_IMPL_1RG (vec_uint4, wa, vec_double2, wd , xvcvdpuxws, vec_ctuo)
283
- VSX_IMPL_1RG(vec_uint4, wa, vec_float4, wf, xvcvspuxws, vec_ctu)
284
+ VSX_IMPL_1RG (vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
285
+ VSX_IMPL_1RG(vec_uint4, vec_float4, xvcvspuxws, vec_ctu)
284
286
285
287
// converts single and double precision to signed doubleword
286
- #ifdef vec_ctsl
287
- # undef vec_ctsl
288
- #endif
289
- VSX_IMPL_1RG (vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
290
- VSX_IMPL_1RG(vec_dword2, wi, vec_float4, wf, xvcvspsxds, vec_ctslo)
288
+ #undef vec_ctsl
289
+ VSX_IMPL_1RG (vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
290
+ VSX_IMPL_1RG(vec_dword2, vec_float4, xvcvspsxds, vec_ctslo)
291
291
292
292
// converts single and double precision to unsigned doubleword
293
- #ifdef vec_ctul
294
- # undef vec_ctul
295
- #endif
296
- VSX_IMPL_1RG (vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
297
- VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
293
+ #undef vec_ctul
294
+ VSX_IMPL_1RG (vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
295
+ VSX_IMPL_1RG(vec_udword2, vec_float4, xvcvspuxds, vec_ctulo)
298
296
299
297
// just in case GCC doesn't define it
300
298
#ifndef vec_xl
0 commit comments