From b1caca297f0495371edaaeaafe13754c25fa62e8 Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe"
Date: Sun, 27 Jul 2025 18:08:37 +0200
Subject: [PATCH 1/2] Squashed 'lib/simde/simde/' changes from cbef1c152..0faa907b2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

0faa907b2 gcc pedantic: fp16 is not part of ISO C, silence the warning f53a9cf79 gcc pedantic: also silence this other warning about __int128 59f779845 arm neon: Add float16 multi-vectors to native aliases 4b279d62e https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100927 was fixed in GCC 15.x 5c8f50ec1 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95782 was fixed in GCC 13 677f2cbee Avoid undefined behaviour with signed integer multiplication (#1296) 2096f755e arm64 gcc FRINT: skip native call on GCC 3ea330475 x86 sse2 for loongarch: fix GCC build failure (#1287) a532a12ca riscv64: Fallback to autovec without mrvv-vector-bits flag. (#1282) 85632ca82 arm neon riscv64: add min.h and max.h RVV implementations. (#1283) ca1e942d9 neon riscv64: Enable RVV segment load/store only when we have `__riscv_zvlsseg` flag. (#1285) cf8e6a73d riscv64: Enable V feature when both zve64d and zvl128b are present (#1284) c7f26b73b x86 avx for loongarch: use vfcmp_clt to save one instruction in `_mm_cmp_{sd,ss}` and `_mm256_cmp_pd` a8ae10d96 x86 sse2,avx2 loongarch impl: let compiler to generate instructions based on imm8 bb0282e3b x86 misc fixes for AVX512{F,VL}_NATIVE d458d8fdd x86 sse2,sse3, avx: silence some false-positive warnings about unitialized structs 4184e0d42 start preparing to release SIMDe 0.8.4 87ecd64a5 x86 sse2: fix overflow error detected by clang scan-build in simde_mm_srl_epi{16,32,64} when count is too high ca9449c1e Fix incorrect UQRSHL implementation. 8d90b0411 arm neon: fix `cmla{_rot{90,180,270},}_lane` with correct test-suite on ARMv8.3 system 500454a2a arm neon: replace use of SIMDE_ARCH_ARM_CHECK(8+) with feature checks. 02ba92220 arm neon gcc-12 FRINT workaround 02d815773 arm neon FCMLA with 16-bit floats, requires the FP16 feature 8caaee795 arm neon: FRINT{32,64}{X,Z} native calls require ARMv8.5 438ddcff6 remove extraneous semicolons from many macro-defined functions 9f73373ff wasm simd128: fix a FAST_NANS error on arm64 0bd19a993 Fix vqdmulhs_s32 native alias. 62f40d4b8 x86 avx2: small fixes for loongarch d656b4d7e x86 sse2: small fixes for loongarch 8f56d4ff1 Remove incorrect qrdmulh SSE code. 8c421df17 arm neon: define native alias only under the inverse of the conditions of a pass-through 25e70ce71 simde-aes: gcc 13.2+ ignore unused variable warnings 69c9cd5c3 arm neon qdmlal: fix saturation (#1194) 34136823c Fix vqshlud_n_s64 implementation to be 64-bit. 483a4bccf Fix qdmlsl instructions f275fffd9 arm neon qshl: Fix UQSHL to match hardware. Add extensive test vectors. (#1256) d95bd9d76 arm neon qdmull: Fix SQDMULL implementation for 32-bit inputs. (#1255) 4b9007046 x86 sse2: fix `_mm_pause` for RISCV systems 0be41ec7c risc64 gcc-14: Disable uninitialized variable warnings for some ARM neon SM3 functions 70fc574b2 arm: Rename ARM ROL/ROR functions with a SIMDE prefix.
a39bd6dde arm neon sli_n: Fix invalid shift warnings (#1253) 7bd2bb70e arm neon `_vext_p6`: reverse logic to avoid GCC14 i586 bug (#1251) b4bf72e14 x86 clmul simde_x_bitreverse_u64: add loongarch implementation (#1249) 04f9b4ca6 x86 avx: reoptimized simde_mm256_addsub_ps/d with lasx 54d352981 x86 fma: add loongarch lasx optimized implementations adefb8dcb x86 f16c: add loongarch lasx optimized implementations b720dcb7d x86 avx512f: added fmaddsub implementation (#1246) 5c9f6aa19 x86 sse4.2: add loongarch lsx optimized implementations 783703714 x86 sse4.1: add loongarch lsx optimized implementations 0bfc2312f x86 ssse3: add loongarch lsx optimized implementations fcae0eee0 x86 sse3: add loongarch lsx optimized implementations af6467260 x86 sse: add loongarch lsx optimized implementations 2ad64c9f7 x86 avx2: add loongarch lasx optimized implementations (#1241) 5cae2261b x86 avx: add loongarch lasx optimized implementations (#1239) 484fcce25 x86 avx: use INT64_C when the destination is i64 (#1238) 5e225b1c6 loongarch: add lsx support for sse2.h 665d7f93b fix clang type redef error b0fcc6176 Whoops, missing comma fe262fb0e loongarch float16: use a portable version to avoid compilation errors 1a09d3bc9 x86: move definition of 'value' to correct branch in _mm_loadl_epi64 aac583326 x86: some better implementations for MSVC and others without SIMDE_STATEMENT_EXPR_ d1afb3db1 arm crc32: define SIMDE_ARCH_ARM_CRC32 and consistently use it 592f8f0c4 _mm256_storeu_pd and _mm256_loadu_pd using 128 bit lanes de4337e8d gcc-14 -O3 complained about some possible unitialized values 8b0937a3e neon/cvz z/Arch: stop using deprecated functions. e18dcd7d0 arm neon: avoid GCC 11 vst1_*_x4 built-in functions 848fb7777 arm neon: fix arm64 gcc11 build excess elements in vector failure 0aaf78298 x86/sse: Fix type convert error for LSX. 29c96207c arm wasm: add vst2_u8 translation to Wasm SIMD 375ad48fd arm wasm: add vshll translations to Wasm SIMD d5697fa99 arm wasm: add vst4_u8 translation to Wasm SIMD e235b2eb1 math: typo fix, check SIMDE_MATH_NANF instead of the old-style SIMDE_NANF cb4b08c47 wasm AltiVec: add u16x8 and u8x16 avgr translations 90237caba wasm NEON: add u16x8 and u8x16 avgr translations 6050906e9 arm neon vminnmv_f16: remove duplicate statement (#1208) a3d20d145 x86 wasm: Wasm SIMD version of `_mm_sad_epu8` 32650204e msvc: add simde_MemoryBarrier to avoid including 7ca5a3e0b x86/fma: Use 128 bit fnmadd_pd to do 256 bit fnmadd_pd (#1197) 2ec1f51f8 pow: consistently use simde_math_pow 80f655739 x86: remove redundant mm_add_pd translation for WASM (#1190) 249b9dc03 arm/neon riscv64: additional RVV implementations - part 2. (#1189) 408d06a35 arm/neon riscv64: additional RVV implementations - part1 (#1188) da5cf1f54 Use _Float16 in C++ on aarch64 with GCC 13+ 39f436a9e Don't use _Float16 on non-SSE2 x86 985c27100 Don't use _Float16 on s390x 787830467 x86: Apply half tabular method in _mm_crc32 family d8a0c764f arm: improve performance in vqadd and vmvn in risc-v 99c63a427 neon: avoid warnings when "__ARM_NEON_FP" is not defined. 
e98cbcc70 start next development cycle: v0.8.3 3442dbf2d prepare to release 0.8.0 e6afb7bec arm neon: Fully remove the problematic FCVTZS/FCVTMS/FCVTPS/FCVTNS family intrinsics fb73a3182 arm: improve performance in vabd_xxx for risc-v 8a4ff7a8b arm: improve performance in vhadd_xxx for risc-v 52f1087ad arm: Add neon2rvv support in vand series intrinsics 737e3b33f arm: fix some neon2rvv intrinsic function error 5242a77dc arm: enable more intrinsic function for armv7 8f123e5c0 wasm x86 impl: some were incorrectly marked SSE instead of SSE2 2b9b01269 arm x86 implementations: allow _m128 access from SSE 6679ff018 svml: SSE is good enough for native m128i and m128d types & functions 68aac3b9a sse2 MSVC `_mm_pause` implementaiton for x86 e76f4331e typo fixes from codespell 73160356b x86 xop: fix some native functions 4ecf271be emscripten; use `__builtin_roundeven{f,}` from version 3.1.43 onwards 347e2b699 arm 32 bits: native def fixes; workarounds for gcc 61d1addce apple clang arm64: ignore SHA2 b58359225 arm platform: cleanup feature detection. e38f25685 arm neon sm3: check constant range ac2b229a1 arm neon: disable some FCVTZS/FCVTMS/FCVTPS/FCVTNS family intrinsics 1d7848cf9 arm neon clang: skip vrnd native before clang v18 bb11054b5 clang: detect versions 18 & 19 647bb87de Initial Support for the RISC-V Vector Extension in ARM NEON (#1130) 83479bd70 start next development cycle: v0.8.1 22a493c26 arm/neon abs: negating INT_MIN is undefined behavior 453dec209 simde-detect-clang.h: add clang 17 detection (#1132) e6fab1296 Update simde-detect-clang.h (#1131) e29a4fab5 typo: XCode -> Xcode (#1129) 8392c69a1 Improve performance of simde_mm512_add_epi32 (#1126) ddaab3759 neon {u,s}addh apply arm64 windows workaround only on msvc<1938 (#1121) 8e9d432a6 correction of simde_mm256_sign_epi{8,16,32}. (#1123) 43ec909bb avx512 abs: refine GCC compiler checks for `_mm512{,_mask}_abs_pd` (#1118) 24be11d00 gh-actions: test mips64el using qemu on gcc12/clang16 f0bd155cf wasm relaxed: add f{32x4,64x2}_relaxed_{min,max} 459abf9f7 wasm simd128/relaxed: begin MIPS implementations ffe050ce9 wasm relaxed: updated names; reordered FMA operations 762d7ad22 wasm: detect support for Relaxed SIMD mode e96949e3f prepare to release 0.8.0 f73d72e4e NEON: implement all bf16-related intrinsics (#1110) 72f6d30fe neon: add enable vmlaq_laneq_f32 and vcvtq_n_f64_u64 d6271b3fe NEON: implement all intrinsics supported by architecture A64-remaining part (#1093) 7904fc3cf sse2 mm_pause: more archs, add a basic test 260adca59 arm neon ld2: silence warnings at -O3 on gcc risc-v d1578a0ce simde_float16: prefer __fp16 if available 064b80493 svml: don't enable SIMDE_X86_SVML_NATIVE for ClangCl 790e8d6c3 fp16: don't use _Float16 on ClangCL if not supported 87f7d3317 neon: Modified simde_float16 to simde_float16_t (#1100) 2c58d6d05 Reuse unoptimized implementations of vaesimcq_u8 from x86 be7e377cb [NEON] Add AES instructions. 
a686efde4 x86 sse4.1 mm_testz_si128: fix backwards short circuit logic bc206e4fa wasm f{32x4,64x2}_min: add workaround for a gcc<6 issue f2e82c961 x86 pclmul: fix natives, some require VPCLMULQDQ 0adef454b avx512 gather: add MSVC native fallbacks ecc469297 avx512 set: add simde_x_mm512_set_m256{,d} 70f702627 NEON: part 1 of implement all intrinsics supported by architecture A64 (#1090) aefb342e9 avx512 types: avoid using native AVX512 types on MSVC unless required 833a87750 svml: enable SIMDE_X86_SVML_NATIVE for MSVC 2019+ db326c75c sse{,2,4.1}, avx{,2} *_stream_{,load}: use __builtin_nontemporal_{load,store} 3f0321ba2 sse _mm_movemask_ps: remove unused code 4c7c77217 gh-actions: test with clang-16 87cf105ab neon/st1{,q}_*_x{2,3,4}: initial implementation (#1082) 5634cec09 NEON: more fp16 using intrinsics supported by architecture v7 (skip version) (#1081) cfd917230 arm neon: Complex operations from Armv8.3-a (#1077) 389f360a6 arm aes: add neon implementation using the crypto extension 4adb6591f arm: use SIMDE_ARCH_ARM_FMA faeb00a70 avx512: fix many native aliases 98eb64b85 sse: implement _mm_movelh_ps for Arm64 57197e8db aes: initial implementation of most aes instructions (#1072) 2fbc63391 NEON: Implement some f16XN types and f16 related intrinsics. (#1071) 64f94c681 avx: simde_mm256_shuffle_pd fix for natural vector size < 128 d41408997 Add workaround for GCC bug 111609 665676042 Extend constant range in simde_vshll_n_XXX intrinsics (#1064) 33c4480de Remove non-working MMX specialization from simde_vmin_s16 6f4afd634 Fix issues related to MXCSR register (#1060) 82be3395b fix SIMDE_ARCH_X86_SSE4_2 define 38580983e riscv64 clang: doesn't support _Float16 or __fp16 properly yet a39f2c3b3 avx512/shuffle: mm512_{shuffle_epi32,shuffle{hi,lo}_epi16} a202d0116 avx512/gather: mm512_{mask_,}i64gather_{epi32,epi64,ps,pd) 95a6d0813 avx512 new families started: gather/reduce + other additional funcs ef8931287 avx512 cmp,cvt,cvts,cvtt,cvtus,gather,kand,permutex,rcp: new ops for intgemm 4a29d21ff avx512: start supporting AVX512FP16 / m512h f686d38f1 clang wasm: SIMDE_BUG_CLANG_60655 is fixed in the upcoming 17.0 release 7760aabd1 GCC AVX512F: SIMDE_BUG_GCC_95399 was fixed in GCC 9.5, 10.4, 11.4, 12+ 436dd4cc1 GCC x86/x64: SIMDE_BUG_GCC_98521 was fixed in 10.3 843112308 GCC x86: SIMDE_BUG_GCC_94482 was fixed in 8.5, 9.4, 10+ e140ac4e2 x86/avx512 fpclass: improve fallback implementation 5950c402c gh-actions: re-order ccache; add old clang/gcc versions faf228937 avx512/loadu: fix native detection b3341922b simde-f16: improve _Float16 usage; better INFHF/NANHF defs 5e632b09d avx512: naive implementation of fpclass b71b58c27 [NEON/A32V7]: Don't trust clang for load multiple on A32V7 c5de4d090 neon: Add qtbl/qtbx polyfills for A32V7 3bda0d7c6 neon/cvtn: vcvtnq_u32_f32 is a V8 function 73910b60c msa neon impl: float64x2_t is not avail in A32V7 0540d7fc2 clang aarch64: optimization bug 45541 was fixed in clang-15 d315aac71 clmul: aarch64 clang has difficulties with poly64x1_t a2eeb9ef1 sse4.1: use logical OR instead of bitwise OR in neon impl of _mm_testnzc_si128 e676d9982 clang powerpc: vec_bperm bug was fixed in clang-14 0e3290e86 neon/st1: disable last remaining AltiVec implementation db0649e1d wasm simd128: more powerpc fixes bbdb2a1f5 sse2,wasm simd128: skip SIMDE_CONVERT_VECTOR_ impementations on PowerPC c6b6ac500 wasm/simd128: add missing unsigned functions 78faeab11 wasm/simd128: fix altivec_p7 version of wasm_f64x2_pmin 1f359106b We are in a dev period again: v0.7.7 9135bd049 neon/cvtn: 
vcvtnq_{s32_f32,s64_f64}: add SSE & AVX512 optimized implementations 6a1db3a5a neon/cvtn: basic implementation of a few functions 1cf65cb0a mmx: loogson impl promotions over SIMDE_SHUFFLE_VECTOR_ 4ab8749df sse{,2,3,4.1},avx: more WASM shuffle implementations b49fa29d5 avx512: arghhh: really fix typedef of __mmask64 6244ab92e avx512: typo fix for typedef of __mmask64 20c5200d6 avx512/madd: fix native alias arguments for _mm512_madd_epi16 cc476f364 neon/qabs: restore SSE2 impl for vqabsq_s8 a7682611d neon/abd,ext,cmla{,_rot{180,270,90}}: additional wasm128 implementations ca523adb7 sse: allow native _mm_loadh_pi on MSVC x64 ac526659e test: appease GCC 5.x & clang 01ea9a8d3 start release process for 0.7.6 28a6001f6 x86/sse*,avx: add additional SIMD128 implementations aca2f0ae6 neon/shl,rshl: fix avx include to unbreak amalgamated hearders f60a9d8df neon/mla_lane: initial implementation using mla+dup f982cfd51 Update clang version detection for 14..16 and add link b45a14ccc simde-arch: include hedley for setting F16C for MSVC 2022+ with AVX2 3ce91d4cd 0.7.5 dev cycle on the road to 0.7.6/0.8.0 02c7a67ed sse: remove unbalanced HEDLEY_DIAGNOSTIC_PUSH b0b370a4b x86/sse: Add LoongArch LSX support 2338f175d arch: Add LoongArch LASX/LSX support 90d95fae4 avx512: define __mask64 & __mask32 if not yet defined 42a43fa57 sve/true,whilelt,cmplt,ld1,st1,sel,and: skip AVX512 native implementations on MSVC 2017 20f98da6f sve/whilelt: correct type-o in __mmask32 initialization 47a1500f7 sve/ptest: _BitScanForward64 and __builtin_ctzll is not available in MSVC 2017 cd93fcc9e avx512/knot,kxor: native calls not availabe on MSVC 2017 ba6324b6b avx512/loadu: _mm{,256}_loadu_epi{8,16,32,64} skip native impl on MSVC < 2019 2f6fe9c64 sse2/avx: move some native aliases around to satisfy MSVC 2017 /ARCH:AVX512 91fda2cc9 axv512/insert: unroll SIMDE_CONSTIFY for testing macro implemented functions a397b74b3 __builtin_signbit: add cast to double for old Clang versions e016050b2 clmul: _mm512_clmulepi64_epi128 implicitly requires AVX512F 7e353c009 Wasm q15mulr_sat_s: match Wasm spec ce375861c Wasm f32/f64 nearest: match Wasm spec 96d5e0346 Wasm f32/f64 floor/ceil/trunc/sqrt: match Wasm spec 5676a1ba7 Wasm f32/f64 abs: match Wasm spec aa299c08b Wasm f32/f64 max: match Wasm spec 433d2b951 Wasm f32/f64 min: match Wasm spec cf1ac40b8 avx{,2}: some intrinsics are missing from older MSVC versions bff9b1b3c simd128: move unary minus to appease msvc native arm64 efc512a49 neon/ext: unroll SIMDE_CONSTIFY for testing macro implemented functions 091250e81 neon/addlv: disable SSSE3 impl of _vaddlvq_s16 for MSVC 4b3053606 neon/ext: simde_*{to,from}_m64 reqs MMX_NATIVE 2dedbd9bf skip many mm{,_mask,_maskz}_roundscale_round_{ss,sd} testing on MSVC + AVX a04ea7bc9 f16c: rounding not yet implemented for simde_mm{256,}_cvtps_ph e8ee041ab ci appveyor: build tests with AVX{,2}, but don't run them 2188c9728 arm/neon/add{l,}v: SSE2/SSSE3 opts _vadd{lvq_s8, lvq_s16, lvq_u8, vq_u8} 186f12f17 axv512: add simde_mm512_{cvtepi32_ps,extractf32x8_ps,_cmpgt_epi16_mask} 6a40fdeb5 arm/neon/rnd: use correct SVML function for simde_vrndq_f64 9a0705b06 svml: simde_mm256_{clog,csqrt}_ps native reqs AVX not SSE c298a7ec2 msvc avx512/roundscale_round: quiet a false positive warning 01d9c5def sse: remove errant MMX requirement from simde_mm_movemask_ps c675aa08d x86/avx{,2}: use SIMDE_FLOAT{32,64}_C to fix warnings from msvc 097af509e msvc 2022: enable F16C if AVX2 present 91cd7b64b avx{,2}: fix maskload illegal mem access 2caa25b85 Fixed simde_mm_prefetch 
warnings 96bdf5234 Fixed parameters to _mm_clflush 4d560e418 emscripten; don't use __builtin_roundeven{f,} even if defined 511a01e7d avx512/compress: Mitigate poor compressstore performance on AMD Zen 4 a22b63dc9 avx512/{knot,kxor,cmp,cmpeq,compress,cvt,loadu,shuffle,storeu} Additional AVX512{F,BW,VBMI2,VL} ops 3d87469f6 wasm simd128: correct trunc_sat _FAST_CONVERSION_RANGE target type 56ca5bd89 Suppress min/max macro definitions from windows.h f2cea4d33 arm/neon/qdmulh s390 gcc-12: __builtin_shufflevector is misbehaving 3698cef9b neon/cvt: clang bug 46844 was fixed in clang 12.0 9369cea4a simd128: clang 13 fixed bugs affecting simde_wasm_{v128_load8_lane,i64x2_load32x2} ce27bd09a gcc power: vec_cpsgn argument reversal fixed in 12.0 20fd5b94b gcc power: bugs 1007[012] fixed in GCC 12.1 5e25de133 gcc sse2: bug 99754 was fixed in GCC 12.1 e69796025 gcc i686 mm*_dpbf16_ps: skip vector ops due to rounding error 359c3ff47 clang wasm simde: add workaround to fix wasm_i64x2_shl bug b767f5edc arm/neon: workaround on ARM64 windows bug 599b1fbf4 mips/msa: fix for Windows ARM64 c6f4821ed arm64 windows: fix simd128.h build error 782e7c73e prepare to release 0.7.4 6e9ac2457 fix A32V7 version of _mm_test{nz,}c_si128 776f7a699 test with Debian default flags, also for armel a240d951a x86: fix AVX native → SSE4.2 native 5a73c2ce5 _mm_insert_ps: incorrect handling of the control 597a1c9e4 neon/ld1[q]_*_x2: initial implementation 4550faeac wasm: f32x4 and f64x2 nearest roundeven 5e0686459 Add missing `static const` in simde-math.h. NFC da02f2cee avx512/setzero: fix native aliases 89762e11b Fixed FMA detection macro on msvc b0fda5cf2 avx512/load_pd: initial implementation a61af0778 avx512/load_ps: initial implementation 4126bde01 Properly map __mm functions to __simde_mm 2e76b7a69 neon ld2: gcc-12 fixes 604a53de3 fix wrong size e5e085ff8 AVX: add native calls for _mm256_insertf128_{pd,ps,si256} ee3bd005b aarch64 + clang-1[345] fix for "implicit conversion changes signedness" a060c461a wasm: load lane memcpy instead of cast to address UBSAN issues

git-subtree-dir: lib/simde/simde
git-subtree-split: 0faa907b261001f89ac89becaea20beddd675468
---
 arm/neon.h | 115 +- arm/neon/abal.h | 178 ++ arm/neon/abal_high.h | 125 + arm/neon/abd.h | 140 + arm/neon/abdl_high.h | 181 ++ arm/neon/abs.h | 129 +- arm/neon/add.h | 283 ++- arm/neon/addhn_high.h | 124 + arm/neon/addl.h | 43 + arm/neon/addl_high.h | 55 + arm/neon/addlv.h | 209 +- arm/neon/addv.h | 278 +- arm/neon/addw.h | 43 +- arm/neon/addw_high.h | 100 +- arm/neon/aes.h | 222 ++ arm/neon/and.h | 32 + arm/neon/bcax.h | 105 +- arm/neon/bic.h | 113 +- arm/neon/bsl.h | 157 +- arm/neon/cadd_rot270.h | 231 ++ arm/neon/cadd_rot90.h | 231 ++ arm/neon/cage.h | 9 +- arm/neon/cagt.h | 9 +- arm/neon/cale.h | 168 ++ arm/neon/calt.h | 168 ++ arm/neon/ceq.h | 106 +- arm/neon/ceqz.h | 68 +- arm/neon/cge.h | 127 +- arm/neon/cgez.h | 62 + arm/neon/cgt.h | 170 +- arm/neon/cgtz.h | 62 + arm/neon/cle.h | 167 +- arm/neon/clez.h | 173 +- arm/neon/clt.h | 170 +- arm/neon/cltz.h | 62 + arm/neon/cmla.h | 113 +- arm/neon/cmla_lane.h | 386 +++ arm/neon/cmla_rot180.h | 117 +- arm/neon/cmla_rot180_lane.h | 422 ++++ arm/neon/cmla_rot270.h | 109 +- arm/neon/cmla_rot270_lane.h | 421 ++++ arm/neon/cmla_rot90.h | 117 +- arm/neon/cmla_rot90_lane.h | 420 +++ arm/neon/cnt.h | 62 +- arm/neon/combine.h | 176 +- arm/neon/copy_lane.h | 1200 +++++++++ arm/neon/crc32.h | 295 +++ arm/neon/create.h | 77 +- arm/neon/cvt.h | 1315 +++++++++- arm/neon/cvt_n.h | 703 ++++++ arm/neon/cvtm.h | 389 +++ arm/neon/cvtn.h | 
538 ++++ arm/neon/cvtp.h | 387 +++ arm/neon/div.h | 202 ++ arm/neon/dot.h | 217 +- arm/neon/dot_lane.h | 470 +++- arm/neon/dup_lane.h | 515 ++++ arm/neon/dup_n.h | 319 ++- arm/neon/eor.h | 292 +++ arm/neon/ext.h | 599 ++++- arm/neon/fma.h | 111 +- arm/neon/fma_lane.h | 158 +- arm/neon/fma_n.h | 51 +- arm/neon/fmlal.h | 557 ++++ arm/neon/fmlsl.h | 397 +++ arm/neon/fms.h | 195 ++ arm/neon/fms_lane.h | 334 +++ arm/neon/fms_n.h | 174 ++ arm/neon/get_high.h | 155 +- arm/neon/get_lane.h | 208 ++ arm/neon/get_low.h | 142 +- arm/neon/hadd.h | 60 + arm/neon/hsub.h | 61 + arm/neon/ld1.h | 345 ++- arm/neon/ld1_dup.h | 141 ++ arm/neon/ld1_lane.h | 168 ++ arm/neon/ld1_x2.h | 484 ++++ arm/neon/ld1_x3.h | 514 ++++ arm/neon/ld1_x4.h | 545 ++++ arm/neon/ld1q_x2.h | 486 ++++ arm/neon/ld1q_x3.h | 514 ++++ arm/neon/ld1q_x4.h | 544 ++++ arm/neon/ld2.h | 626 ++++- arm/neon/ld2_dup.h | 620 +++++ arm/neon/ld2_lane.h | 642 +++++ arm/neon/ld3.h | 634 ++++- arm/neon/ld3_dup.h | 616 +++++ arm/neon/ld3_lane.h | 642 +++++ arm/neon/ld4.h | 557 +++- arm/neon/ld4_dup.h | 617 +++++ arm/neon/ld4_lane.h | 236 ++ arm/neon/max.h | 303 ++- arm/neon/maxnm.h | 92 +- arm/neon/maxnmv.h | 203 ++ arm/neon/maxv.h | 67 + arm/neon/min.h | 233 +- arm/neon/minnm.h | 92 +- arm/neon/minnmv.h | 224 ++ arm/neon/minv.h | 67 + arm/neon/mla.h | 135 +- arm/neon/mla_lane.h | 241 ++ arm/neon/mla_n.h | 49 +- arm/neon/mlal.h | 55 + arm/neon/mlal_high.h | 55 + arm/neon/mlal_high_lane.h | 147 ++ arm/neon/mlal_high_n.h | 29 + arm/neon/mlal_n.h | 30 +- arm/neon/mls.h | 125 +- arm/neon/mls_lane.h | 240 ++ arm/neon/mls_n.h | 71 + arm/neon/mlsl.h | 55 + arm/neon/mlsl_high.h | 67 + arm/neon/mlsl_high_lane.h | 147 ++ arm/neon/mlsl_high_n.h | 33 + arm/neon/mlsl_n.h | 29 + arm/neon/mmlaq.h | 158 ++ arm/neon/movl.h | 31 +- arm/neon/movn.h | 25 +- arm/neon/mul.h | 227 +- arm/neon/mul_lane.h | 425 +++- arm/neon/mul_n.h | 29 + arm/neon/mull.h | 57 + arm/neon/mull_high.h | 53 + arm/neon/mull_high_lane.h | 170 ++ arm/neon/mull_high_n.h | 98 + arm/neon/mulx.h | 237 ++ arm/neon/mulx_lane.h | 455 ++++ arm/neon/mulx_n.h | 69 + arm/neon/mvn.h | 71 + arm/neon/neg.h | 61 + arm/neon/padd.h | 31 +- arm/neon/paddl.h | 2 +- arm/neon/pmax.h | 31 +- arm/neon/pmaxnm.h | 142 ++ arm/neon/pmin.h | 29 + arm/neon/pminnm.h | 142 ++ arm/neon/qabs.h | 2 +- arm/neon/qadd.h | 32 + arm/neon/qdmlal.h | 98 + arm/neon/qdmlal_high.h | 69 + arm/neon/qdmlal_high_lane.h | 82 + arm/neon/qdmlal_high_n.h | 69 + arm/neon/qdmlal_lane.h | 122 + arm/neon/qdmlal_n.h | 69 + arm/neon/qdmlsl.h | 98 + arm/neon/qdmlsl_high.h | 70 + arm/neon/qdmlsl_high_lane.h | 82 + arm/neon/qdmlsl_high_n.h | 70 + arm/neon/qdmlsl_lane.h | 122 + arm/neon/qdmlsl_n.h | 69 + arm/neon/qdmulh.h | 22 +- arm/neon/qdmulh_lane.h | 23 + arm/neon/qdmull.h | 39 +- arm/neon/qdmull_high.h | 69 + arm/neon/qdmull_high_lane.h | 107 + arm/neon/qdmull_high_n.h | 70 + arm/neon/qdmull_lane.h | 206 ++ arm/neon/qdmull_n.h | 69 + arm/neon/qmovun_high.h | 84 + arm/neon/qrdmlah.h | 186 ++ arm/neon/qrdmlah_lane.h | 162 ++ arm/neon/qrdmlsh.h | 186 ++ arm/neon/qrdmlsh_lane.h | 162 ++ arm/neon/qrdmulh.h | 31 +- arm/neon/qrdmulh_lane.h | 21 + arm/neon/qrshl.h | 750 ++++++ arm/neon/qrshrn_high_n.h | 189 ++ arm/neon/qrshrn_n.h | 21 + arm/neon/qrshrun_high_n.h | 113 + arm/neon/qrshrun_n.h | 29 +- arm/neon/qshl.h | 107 +- arm/neon/qshl_n.h | 513 ++++ arm/neon/qshlu_n.h | 64 +- arm/neon/qshrn_high_n.h | 101 + arm/neon/qshrn_n.h | 21 + arm/neon/qshrun_high_n.h | 113 + arm/neon/qshrun_n.h | 11 + arm/neon/qsub.h | 45 +- arm/neon/qtbl.h | 254 ++ arm/neon/qtbx.h | 258 
++ arm/neon/raddhn.h | 182 ++ arm/neon/raddhn_high.h | 102 + arm/neon/rax.h | 64 + arm/neon/rbit.h | 44 + arm/neon/recpe.h | 92 +- arm/neon/recps.h | 64 + arm/neon/recpx.h | 133 + arm/neon/reinterpret.h | 4490 ++++++++++++++++++++++++++++++++- arm/neon/rev16.h | 36 + arm/neon/rev32.h | 72 +- arm/neon/rev64.h | 107 +- arm/neon/rnd.h | 63 +- arm/neon/rnd32x.h | 160 ++ arm/neon/rnd32z.h | 160 ++ arm/neon/rnd64x.h | 160 ++ arm/neon/rnd64z.h | 160 ++ arm/neon/rnda.h | 242 ++ arm/neon/rndi.h | 69 +- arm/neon/rndm.h | 61 + arm/neon/rndn.h | 76 +- arm/neon/rndp.h | 64 + arm/neon/rndx.h | 194 ++ arm/neon/rshl.h | 42 +- arm/neon/rshr_n.h | 19 +- arm/neon/rshrn_high_n.h | 101 + arm/neon/rsqrte.h | 87 +- arm/neon/rsqrts.h | 64 + arm/neon/rsubhn.h | 209 ++ arm/neon/rsubhn_high.h | 102 + arm/neon/set_lane.h | 195 ++ arm/neon/sha1.h | 208 ++ arm/neon/sha256.h | 197 ++ arm/neon/sha512.h | 157 ++ arm/neon/shl.h | 408 ++- arm/neon/shl_n.h | 50 +- arm/neon/shll_high_n.h | 180 ++ arm/neon/shll_n.h | 127 +- arm/neon/shr_n.h | 76 +- arm/neon/shrn_high_n.h | 114 + arm/neon/shrn_n.h | 73 +- arm/neon/sli_n.h | 343 +++ arm/neon/sm3.h | 267 ++ arm/neon/sm4.h | 157 ++ arm/neon/sqadd.h | 165 +- arm/neon/sqrt.h | 225 ++ arm/neon/sri_n.h | 73 + arm/neon/st1.h | 261 +- arm/neon/st1_lane.h | 167 ++ arm/neon/st1_x2.h | 326 +++ arm/neon/st1_x3.h | 341 +++ arm/neon/st1_x4.h | 370 +++ arm/neon/st1q_x2.h | 324 +++ arm/neon/st1q_x3.h | 339 +++ arm/neon/st1q_x4.h | 354 +++ arm/neon/st2.h | 537 +++- arm/neon/st2_lane.h | 196 ++ arm/neon/st3.h | 495 +++- arm/neon/st3_lane.h | 198 +- arm/neon/st4.h | 685 ++++- arm/neon/st4_lane.h | 195 ++ arm/neon/sub.h | 70 + arm/neon/subhn_high.h | 102 + arm/neon/subl.h | 37 + arm/neon/subl_high.h | 49 + arm/neon/subw.h | 37 +- arm/neon/subw_high.h | 44 +- arm/neon/sudot_lane.h | 169 ++ arm/neon/tbl.h | 95 + arm/neon/tbx.h | 103 + arm/neon/trn.h | 93 + arm/neon/trn1.h | 192 ++ arm/neon/trn2.h | 192 ++ arm/neon/tst.h | 97 + arm/neon/types.h | 507 +++- arm/neon/uqadd.h | 12 + arm/neon/usdot.h | 95 + arm/neon/usdot_lane.h | 169 ++ arm/neon/uzp.h | 93 + arm/neon/uzp1.h | 207 ++ arm/neon/uzp2.h | 207 ++ arm/neon/xar.h | 4 +- arm/neon/zip.h | 93 + arm/neon/zip1.h | 197 ++ arm/neon/zip2.h | 185 ++ arm/sve/and.h | 12 +- arm/sve/cmplt.h | 60 +- arm/sve/ld1.h | 60 +- arm/sve/ptest.h | 2 +- arm/sve/ptrue.h | 8 +- arm/sve/sel.h | 54 +- arm/sve/st1.h | 96 +- arm/sve/types.h | 2 +- arm/sve/whilelt.h | 100 +- check.h | 2 +- debug-trap.h | 2 +- hedley.h | 1 + mips/msa/adds.h | 4 +- mips/msa/ld.h | 130 +- mips/msa/madd.h | 10 +- mips/msa/st.h | 36 +- mips/msa/types.h | 36 +- simde-aes.h | 270 ++ simde-align.h | 6 +- simde-arch.h | 128 +- simde-bf16.h | 131 + simde-common.h | 212 +- simde-complex.h | 2 +- simde-detect-clang.h | 28 +- simde-diagnostic.h | 4 +- simde-f16.h | 126 +- simde-features.h | 103 +- simde-math.h | 191 +- wasm/relaxed-simd.h | 242 +- wasm/simd128.h | 982 ++++--- x86/aes.h | 417 +++ x86/avx.h | 1478 ++++++++--- x86/avx2.h | 560 +++- x86/avx512.h | 10 + x86/avx512/2intersect.h | 59 +- x86/avx512/abs.h | 4 +- x86/avx512/add.h | 18 +- x86/avx512/cast.h | 33 + x86/avx512/cmp.h | 1198 ++++++++- x86/avx512/cmpeq.h | 48 + x86/avx512/cmpge.h | 104 +- x86/avx512/cmpgt.h | 23 + x86/avx512/cmple.h | 104 +- x86/avx512/cmpneq.h | 32 +- x86/avx512/compress.h | 122 +- x86/avx512/cvt.h | 155 +- x86/avx512/cvts.h | 58 + x86/avx512/cvtt.h | 26 + x86/avx512/cvtus.h | 67 + x86/avx512/dpbf16.h | 6 +- x86/avx512/extract.h | 89 +- x86/avx512/fmaddsub.h | 91 + x86/avx512/fmsub.h | 24 +- x86/avx512/fpclass.h | 99 + 
x86/avx512/gather.h | 312 +++ x86/avx512/insert.h | 24 +- x86/avx512/kand.h | 53 + x86/avx512/knot.h | 106 + x86/avx512/kxor.h | 107 + x86/avx512/load.h | 48 + x86/avx512/loadu.h | 226 +- x86/avx512/madd.h | 8 +- x86/avx512/maddubs.h | 12 +- x86/avx512/max.h | 24 + x86/avx512/min.h | 24 + x86/avx512/mov.h | 6 + x86/avx512/multishift.h | 6 +- x86/avx512/permutex.h | 101 + x86/avx512/permutex2var.h | 8 +- x86/avx512/permutexvar.h | 14 + x86/avx512/range.h | 16 +- x86/avx512/range_round.h | 16 +- x86/avx512/rcp.h | 65 + x86/avx512/reduce.h | 355 +++ x86/avx512/rol.h | 2 +- x86/avx512/ror.h | 2 +- x86/avx512/round.h | 4 +- x86/avx512/roundscale.h | 24 +- x86/avx512/roundscale_round.h | 52 +- x86/avx512/set.h | 86 +- x86/avx512/set1.h | 21 + x86/avx512/setone.h | 6 + x86/avx512/setzero.h | 23 +- x86/avx512/shuffle.h | 176 +- x86/avx512/sll.h | 2 +- x86/avx512/srai.h | 26 + x86/avx512/storeu.h | 131 +- x86/avx512/types.h | 161 +- x86/avx512/xorsign.h | 2 +- x86/clmul.h | 125 +- x86/f16c.h | 122 +- x86/fma.h | 75 +- x86/gfni.h | 2 +- x86/mmx.h | 11 +- x86/sse.h | 615 ++++- x86/sse2.h | 1150 +++++++-- x86/sse3.h | 62 +- x86/sse4.1.h | 470 +++- x86/sse4.2.h | 41 +- x86/ssse3.h | 56 + x86/svml.h | 48 +- x86/xop.h | 2 +- 366 files changed, 63573 insertions(+), 4013 deletions(-) create mode 100644 arm/neon/abal.h create mode 100644 arm/neon/abal_high.h create mode 100644 arm/neon/abdl_high.h create mode 100644 arm/neon/addhn_high.h create mode 100644 arm/neon/aes.h create mode 100644 arm/neon/cadd_rot270.h create mode 100644 arm/neon/cadd_rot90.h create mode 100644 arm/neon/cale.h create mode 100644 arm/neon/calt.h create mode 100644 arm/neon/cmla_lane.h create mode 100644 arm/neon/cmla_rot180_lane.h create mode 100644 arm/neon/cmla_rot270_lane.h create mode 100644 arm/neon/cmla_rot90_lane.h create mode 100644 arm/neon/copy_lane.h create mode 100644 arm/neon/crc32.h create mode 100644 arm/neon/cvt_n.h create mode 100644 arm/neon/cvtm.h create mode 100644 arm/neon/cvtn.h create mode 100644 arm/neon/cvtp.h create mode 100644 arm/neon/div.h create mode 100644 arm/neon/fmlal.h create mode 100644 arm/neon/fmlsl.h create mode 100644 arm/neon/fms.h create mode 100644 arm/neon/fms_lane.h create mode 100644 arm/neon/fms_n.h create mode 100644 arm/neon/ld1_x2.h create mode 100644 arm/neon/ld1_x3.h create mode 100644 arm/neon/ld1_x4.h create mode 100644 arm/neon/ld1q_x2.h create mode 100644 arm/neon/ld1q_x3.h create mode 100644 arm/neon/ld1q_x4.h create mode 100644 arm/neon/ld2_dup.h create mode 100644 arm/neon/ld2_lane.h create mode 100644 arm/neon/ld3_dup.h create mode 100644 arm/neon/ld3_lane.h create mode 100644 arm/neon/ld4_dup.h create mode 100644 arm/neon/maxnmv.h create mode 100644 arm/neon/minnmv.h create mode 100644 arm/neon/mla_lane.h create mode 100644 arm/neon/mlal_high_lane.h create mode 100644 arm/neon/mls_lane.h create mode 100644 arm/neon/mlsl_high_lane.h create mode 100644 arm/neon/mmlaq.h create mode 100644 arm/neon/mull_high_lane.h create mode 100644 arm/neon/mull_high_n.h create mode 100644 arm/neon/mulx.h create mode 100644 arm/neon/mulx_lane.h create mode 100644 arm/neon/mulx_n.h create mode 100644 arm/neon/pmaxnm.h create mode 100644 arm/neon/pminnm.h create mode 100644 arm/neon/qdmlal.h create mode 100644 arm/neon/qdmlal_high.h create mode 100644 arm/neon/qdmlal_high_lane.h create mode 100644 arm/neon/qdmlal_high_n.h create mode 100644 arm/neon/qdmlal_lane.h create mode 100644 arm/neon/qdmlal_n.h create mode 100644 arm/neon/qdmlsl.h create mode 100644 arm/neon/qdmlsl_high.h create mode 
100644 arm/neon/qdmlsl_high_lane.h create mode 100644 arm/neon/qdmlsl_high_n.h create mode 100644 arm/neon/qdmlsl_lane.h create mode 100644 arm/neon/qdmlsl_n.h create mode 100644 arm/neon/qdmull_high.h create mode 100644 arm/neon/qdmull_high_lane.h create mode 100644 arm/neon/qdmull_high_n.h create mode 100644 arm/neon/qdmull_lane.h create mode 100644 arm/neon/qdmull_n.h create mode 100644 arm/neon/qmovun_high.h create mode 100644 arm/neon/qrdmlah.h create mode 100644 arm/neon/qrdmlah_lane.h create mode 100644 arm/neon/qrdmlsh.h create mode 100644 arm/neon/qrdmlsh_lane.h create mode 100644 arm/neon/qrshl.h create mode 100644 arm/neon/qrshrn_high_n.h create mode 100644 arm/neon/qrshrun_high_n.h create mode 100644 arm/neon/qshl_n.h create mode 100644 arm/neon/qshrn_high_n.h create mode 100644 arm/neon/qshrun_high_n.h create mode 100644 arm/neon/raddhn.h create mode 100644 arm/neon/raddhn_high.h create mode 100644 arm/neon/rax.h create mode 100644 arm/neon/recpx.h create mode 100644 arm/neon/rnd32x.h create mode 100644 arm/neon/rnd32z.h create mode 100644 arm/neon/rnd64x.h create mode 100644 arm/neon/rnd64z.h create mode 100644 arm/neon/rnda.h create mode 100644 arm/neon/rndx.h create mode 100644 arm/neon/rshrn_high_n.h create mode 100644 arm/neon/rsubhn.h create mode 100644 arm/neon/rsubhn_high.h create mode 100644 arm/neon/sha1.h create mode 100644 arm/neon/sha256.h create mode 100644 arm/neon/sha512.h create mode 100644 arm/neon/shll_high_n.h create mode 100644 arm/neon/shrn_high_n.h create mode 100644 arm/neon/sli_n.h create mode 100644 arm/neon/sm3.h create mode 100644 arm/neon/sm4.h create mode 100644 arm/neon/sqrt.h create mode 100644 arm/neon/st1_x2.h create mode 100644 arm/neon/st1_x3.h create mode 100644 arm/neon/st1_x4.h create mode 100644 arm/neon/st1q_x2.h create mode 100644 arm/neon/st1q_x3.h create mode 100644 arm/neon/st1q_x4.h create mode 100644 arm/neon/subhn_high.h create mode 100644 arm/neon/sudot_lane.h create mode 100644 arm/neon/usdot.h create mode 100644 arm/neon/usdot_lane.h create mode 100644 simde-aes.h create mode 100644 simde-bf16.h create mode 100644 x86/aes.h create mode 100644 x86/avx512/cvtus.h create mode 100644 x86/avx512/fmaddsub.h create mode 100644 x86/avx512/fpclass.h create mode 100644 x86/avx512/gather.h create mode 100644 x86/avx512/kand.h create mode 100644 x86/avx512/knot.h create mode 100644 x86/avx512/kxor.h create mode 100644 x86/avx512/permutex.h create mode 100644 x86/avx512/rcp.h create mode 100644 x86/avx512/reduce.h diff --git a/arm/neon.h b/arm/neon.h index df91b0d93..5835db696 100644 --- a/arm/neon.h +++ b/arm/neon.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_H) @@ -30,23 +31,32 @@ #include "neon/types.h" #include "neon/aba.h" +#include "neon/abal.h" +#include "neon/abal_high.h" #include "neon/abd.h" #include "neon/abdl.h" +#include "neon/abdl_high.h" #include "neon/abs.h" #include "neon/add.h" #include "neon/addhn.h" +#include "neon/addhn_high.h" #include "neon/addl.h" #include "neon/addlv.h" #include "neon/addl_high.h" #include "neon/addv.h" #include "neon/addw.h" #include "neon/addw_high.h" +#include "neon/aes.h" #include "neon/and.h" #include "neon/bcax.h" #include "neon/bic.h" #include "neon/bsl.h" +#include "neon/cadd_rot270.h" +#include "neon/cadd_rot90.h" #include "neon/cage.h" #include "neon/cagt.h" +#include "neon/cale.h" +#include "neon/calt.h" #include "neon/ceq.h" #include "neon/ceqz.h" #include "neon/cge.h" @@ -60,13 +70,24 
@@ #include "neon/cltz.h" #include "neon/clz.h" #include "neon/cmla.h" -#include "neon/cmla_rot90.h" +#include "neon/cmla_lane.h" #include "neon/cmla_rot180.h" +#include "neon/cmla_rot180_lane.h" #include "neon/cmla_rot270.h" +#include "neon/cmla_rot270_lane.h" +#include "neon/cmla_rot90.h" +#include "neon/cmla_rot90_lane.h" #include "neon/cnt.h" #include "neon/cvt.h" +#include "neon/cvt_n.h" +#include "neon/cvtm.h" +#include "neon/cvtn.h" +#include "neon/cvtp.h" #include "neon/combine.h" +#include "neon/copy_lane.h" +#include "neon/crc32.h" #include "neon/create.h" +#include "neon/div.h" #include "neon/dot.h" #include "neon/dot_lane.h" #include "neon/dup_lane.h" @@ -76,6 +97,11 @@ #include "neon/fma.h" #include "neon/fma_lane.h" #include "neon/fma_n.h" +#include "neon/fmlal.h" +#include "neon/fmlsl.h" +#include "neon/fms.h" +#include "neon/fms_lane.h" +#include "neon/fms_n.h" #include "neon/get_high.h" #include "neon/get_lane.h" #include "neon/get_low.h" @@ -84,30 +110,48 @@ #include "neon/ld1.h" #include "neon/ld1_dup.h" #include "neon/ld1_lane.h" +#include "neon/ld1_x2.h" +#include "neon/ld1_x3.h" +#include "neon/ld1_x4.h" +#include "neon/ld1q_x2.h" +#include "neon/ld1q_x3.h" +#include "neon/ld1q_x4.h" #include "neon/ld2.h" +#include "neon/ld2_dup.h" +#include "neon/ld2_lane.h" #include "neon/ld3.h" +#include "neon/ld3_dup.h" +#include "neon/ld3_lane.h" #include "neon/ld4.h" +#include "neon/ld4_dup.h" #include "neon/ld4_lane.h" #include "neon/max.h" #include "neon/maxnm.h" +#include "neon/maxnmv.h" #include "neon/maxv.h" #include "neon/min.h" #include "neon/minnm.h" +#include "neon/minnmv.h" #include "neon/minv.h" #include "neon/mla.h" +#include "neon/mla_lane.h" #include "neon/mla_n.h" #include "neon/mlal.h" #include "neon/mlal_high.h" +#include "neon/mlal_high_lane.h" #include "neon/mlal_high_n.h" #include "neon/mlal_lane.h" #include "neon/mlal_n.h" #include "neon/mls.h" +#include "neon/mls_lane.h" #include "neon/mls_n.h" #include "neon/mlsl.h" #include "neon/mlsl_high.h" +#include "neon/mlsl_high_lane.h" #include "neon/mlsl_high_n.h" #include "neon/mlsl_lane.h" #include "neon/mlsl_n.h" +#include "neon/mmlaq.h" #include "neon/movl.h" #include "neon/movl_high.h" #include "neon/movn.h" @@ -117,8 +161,13 @@ #include "neon/mul_n.h" #include "neon/mull.h" #include "neon/mull_high.h" +#include "neon/mull_high_lane.h" +#include "neon/mull_high_n.h" #include "neon/mull_lane.h" #include "neon/mull_n.h" +#include "neon/mulx.h" +#include "neon/mulx_lane.h" +#include "neon/mulx_n.h" #include "neon/mvn.h" #include "neon/neg.h" #include "neon/orn.h" @@ -127,59 +176,117 @@ #include "neon/padd.h" #include "neon/paddl.h" #include "neon/pmax.h" +#include "neon/pmaxnm.h" #include "neon/pmin.h" +#include "neon/pminnm.h" #include "neon/qabs.h" #include "neon/qadd.h" +#include "neon/qdmlal.h" +#include "neon/qdmlal_high.h" +#include "neon/qdmlal_high_lane.h" +#include "neon/qdmlal_high_n.h" +#include "neon/qdmlal_lane.h" +#include "neon/qdmlal_n.h" +#include "neon/qdmlsl.h" +#include "neon/qdmlsl_high.h" +#include "neon/qdmlsl_high_lane.h" +#include "neon/qdmlsl_high_n.h" +#include "neon/qdmlsl_lane.h" +#include "neon/qdmlsl_n.h" #include "neon/qdmulh.h" #include "neon/qdmulh_lane.h" #include "neon/qdmulh_n.h" #include "neon/qdmull.h" +#include "neon/qdmull_high.h" +#include "neon/qdmull_high_lane.h" +#include "neon/qdmull_high_n.h" +#include "neon/qdmull_lane.h" +#include "neon/qdmull_n.h" +#include "neon/qrdmlah.h" +#include "neon/qrdmlah_lane.h" +#include "neon/qrdmlsh.h" +#include "neon/qrdmlsh_lane.h" 
#include "neon/qrdmulh.h" #include "neon/qrdmulh_lane.h" #include "neon/qrdmulh_n.h" +#include "neon/qrshl.h" +#include "neon/qrshrn_high_n.h" #include "neon/qrshrn_n.h" +#include "neon/qrshrun_high_n.h" #include "neon/qrshrun_n.h" #include "neon/qmovn.h" -#include "neon/qmovun.h" #include "neon/qmovn_high.h" +#include "neon/qmovun.h" +#include "neon/qmovun_high.h" #include "neon/qneg.h" #include "neon/qsub.h" #include "neon/qshl.h" +#include "neon/qshl_n.h" #include "neon/qshlu_n.h" +#include "neon/qshrn_high_n.h" #include "neon/qshrn_n.h" +#include "neon/qshrun_high_n.h" #include "neon/qshrun_n.h" #include "neon/qtbl.h" #include "neon/qtbx.h" +#include "neon/raddhn.h" +#include "neon/raddhn_high.h" +#include "neon/rax.h" #include "neon/rbit.h" #include "neon/recpe.h" #include "neon/recps.h" +#include "neon/recpx.h" #include "neon/reinterpret.h" #include "neon/rev16.h" #include "neon/rev32.h" #include "neon/rev64.h" #include "neon/rhadd.h" #include "neon/rnd.h" +#include "neon/rnd32x.h" +#include "neon/rnd32z.h" +#include "neon/rnd64x.h" +#include "neon/rnd64z.h" +#include "neon/rnda.h" #include "neon/rndm.h" #include "neon/rndi.h" #include "neon/rndn.h" #include "neon/rndp.h" +#include "neon/rndx.h" #include "neon/rshl.h" #include "neon/rshr_n.h" +#include "neon/rshrn_high_n.h" #include "neon/rshrn_n.h" #include "neon/rsqrte.h" #include "neon/rsqrts.h" #include "neon/rsra_n.h" +#include "neon/rsubhn.h" +#include "neon/rsubhn_high.h" #include "neon/set_lane.h" +#include "neon/sha1.h" +#include "neon/sha256.h" +#include "neon/sha512.h" #include "neon/shl.h" #include "neon/shl_n.h" +#include "neon/shll_high_n.h" #include "neon/shll_n.h" #include "neon/shr_n.h" +#include "neon/shrn_high_n.h" #include "neon/shrn_n.h" +#include "neon/sli_n.h" +#include "neon/sm3.h" +#include "neon/sm4.h" #include "neon/sqadd.h" +#include "neon/sqrt.h" #include "neon/sra_n.h" #include "neon/sri_n.h" #include "neon/st1.h" #include "neon/st1_lane.h" +#include "neon/st1_x2.h" +#include "neon/st1_x3.h" +#include "neon/st1_x4.h" +#include "neon/st1q_x2.h" +#include "neon/st1q_x3.h" +#include "neon/st1q_x4.h" #include "neon/st2.h" #include "neon/st2_lane.h" #include "neon/st3.h" @@ -188,10 +295,12 @@ #include "neon/st4_lane.h" #include "neon/sub.h" #include "neon/subhn.h" +#include "neon/subhn_high.h" #include "neon/subl.h" #include "neon/subl_high.h" #include "neon/subw.h" #include "neon/subw_high.h" +#include "neon/sudot_lane.h" #include "neon/tbl.h" #include "neon/tbx.h" #include "neon/trn.h" @@ -199,6 +308,8 @@ #include "neon/trn2.h" #include "neon/tst.h" #include "neon/uqadd.h" +#include "neon/usdot.h" +#include "neon/usdot_lane.h" #include "neon/uzp.h" #include "neon/uzp1.h" #include "neon/uzp2.h" diff --git a/arm/neon/abal.h b/arm/neon/abal.h new file mode 100644 index 000000000..e3af088f7 --- /dev/null +++ b/arm/neon/abal.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ABAL_H) +#define SIMDE_ARM_NEON_ABAL_H + +#include "abdl.h" +#include "add.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_, a_ = simde_int16x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + simde_int8x8_private c_ = simde_int8x8_to_private(c); + vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , \ + __riscv_vlmul_trunc_v_i8m1_i8mf2(c_.sv64) , 8); + r_.sv128 = __riscv_vadd_vv_i16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8), a_.sv128, 8); + return simde_int16x8_from_private(r_); + #else + return simde_vaddq_s16(simde_vabdl_s8(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_s8 + #define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_, a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + simde_int16x4_private c_ = simde_int16x4_to_private(c); + vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(c_.sv64) , 4); + r_.sv128 = __riscv_vadd_vv_i32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4), a_.sv128, 4); + return simde_int32x4_from_private(r_); + #else + return simde_vaddq_s32(simde_vabdl_s16(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_s16 + #define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_, a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + simde_int32x2_private c_ = simde_int32x2_to_private(c); + vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(c_.sv64) , 2); + r_.sv128 = __riscv_vadd_vv_i64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2), a_.sv128, 2); + return simde_int64x2_from_private(r_); + #else + return simde_vaddq_s64(simde_vabdl_s32(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_s32 + #define vabal_s32(a, b, c) simde_vabal_s32((a), (b), 
(c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_, a_ = simde_uint16x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + simde_uint8x8_private c_ = simde_uint8x8_to_private(c); + vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv64), 8)); + vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(c_.sv64), 8)); + vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8); + r_.sv128 = __riscv_vadd_vv_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8)), \ + a_.sv128, 8); + return simde_uint16x8_from_private(r_); + #else + return simde_vaddq_u16(simde_vabdl_u8(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_u8 + #define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_, a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + simde_uint16x4_private c_ = simde_uint16x4_to_private(c); + vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv64), 4)); + vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(c_.sv64), 4)); + vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vadd_vv_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4)), \ + a_.sv128, 4); + return simde_uint32x4_from_private(r_); + #else + return simde_vaddq_u32(simde_vabdl_u16(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_u16 + #define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_, a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + simde_uint32x2_private c_ = simde_uint32x2_to_private(c); + vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv64), 2)); + vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(c_.sv64), 2)); + vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vadd_vv_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2)), \ + a_.sv128, 2); + return simde_uint64x2_from_private(r_); + #else + return simde_vaddq_u64(simde_vabdl_u32(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_u32 + #define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_abal_H) 
*/ diff --git a/arm/neon/abal_high.h b/arm/neon/abal_high.h new file mode 100644 index 000000000..78f538dc4 --- /dev/null +++ b/arm/neon/abal_high.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_ABAL_HIGH_H) +#define SIMDE_ARM_NEON_ABAL_HIGH_H + +#include "abdl.h" +#include "add.h" +#include "movl_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vabal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_s8(a, b, c); + #else + return simde_vaddq_s16(simde_vabdl_s8(simde_vget_high_s8(b), simde_vget_high_s8(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_s8 + #define vabal_high_s8(a, b, c) simde_vabal_high_s8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vabal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_s16(a, b, c); + #else + return simde_vaddq_s32(simde_vabdl_s16(simde_vget_high_s16(b), simde_vget_high_s16(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_s16 + #define vabal_high_s16(a, b, c) simde_vabal_high_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vabal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_s32(a, b, c); + #else + return simde_vaddq_s64(simde_vabdl_s32(simde_vget_high_s32(b), simde_vget_high_s32(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_s32 + #define vabal_high_s32(a, b, c) simde_vabal_high_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vabal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_u8(a, b, c); + #else + return simde_vaddq_u16(simde_vabdl_u8(simde_vget_high_u8(b), simde_vget_high_u8(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_u8 + #define vabal_high_u8(a, b, c) simde_vabal_high_u8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t 
+simde_vabal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_u16(a, b, c); + #else + return simde_vaddq_u32(simde_vabdl_u16(simde_vget_high_u16(b), simde_vget_high_u16(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_u16 + #define vabal_high_u16(a, b, c) simde_vabal_high_u16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vabal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_u32(a, b, c); + #else + return simde_vaddq_u64(simde_vabdl_u32(simde_vget_high_u32(b), simde_vget_high_u32(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_u32 + #define vabal_high_u32(a, b, c) simde_vabal_high_u32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_abal_H) */ diff --git a/arm/neon/abd.h b/arm/neon/abd.h index 0a814e8d9..54dad5550 100644 --- a/arm/neon/abd.h +++ b/arm/neon/abd.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ABD_H) @@ -37,6 +38,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vabdh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabdh_f16(a, b); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + simde_float32_t r_ = a_ - b_; + return r_ < 0 ? simde_float16_from_float32(-r_) : simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vabdh_f16 + #define vabdh_f16(a, b) simde_vabdh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vabds_f32(simde_float32_t a, simde_float32_t b) { @@ -67,6 +85,20 @@ simde_vabdd_f64(simde_float64_t a, simde_float64_t b) { #define vabdd_f64(a, b) simde_vabdd_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vabd_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabd_f16(a, b); + #else + return simde_vabs_f16(simde_vsub_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vabd_f16 + #define vabd_f16(a, b) simde_vabd_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vabd_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -116,6 +148,15 @@ simde_vabd_s8(simde_int8x8_t a, simde_int8x8_t b) { m ); + return simde_int8x8_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private r_, max_, min_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + max_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64, b_.sv64, 8); + min_.sv64 = __riscv_vmin_vv_i8m1(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vsub_vv_i8m1(max_.sv64, min_.sv64, 8); return simde_int8x8_from_private(r_); #else return simde_vmovn_s16(simde_vabsq_s16(simde_vsubl_s8(a, b))); @@ -139,6 +180,15 @@ simde_vabd_s16(simde_int16x4_t a, simde_int16x4_t b) { r_.m64 = _mm_sub_pi16(_mm_max_pi16(a_.m64, 
b_.m64), _mm_min_pi16(a_.m64, b_.m64)); + return simde_int16x4_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private r_, max_, min_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + + max_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64, b_.sv64, 4); + min_.sv64 = __riscv_vmin_vv_i16m1(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vsub_vv_i16m1(max_.sv64, min_.sv64, 4); return simde_int16x4_from_private(r_); #else return simde_vmovn_s32(simde_vabsq_s32(simde_vsubl_s16(a, b))); @@ -154,6 +204,15 @@ simde_int32x2_t simde_vabd_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_, max_, min_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + + max_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64, b_.sv64, 2); + min_.sv64 = __riscv_vmin_vv_i32m1(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vsub_vv_i32m1(max_.sv64, min_.sv64, 2); + return simde_int32x2_from_private(r_); #else return simde_vmovn_s64(simde_vabsq_s64(simde_vsubl_s32(a, b))); #endif @@ -168,6 +227,15 @@ simde_uint8x8_t simde_vabd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private r_, max_, min_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + max_.sv64 = __riscv_vmaxu_vv_u8m1(a_.sv64, b_.sv64, 8); + min_.sv64 = __riscv_vminu_vv_u8m1(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vsub_vv_u8m1(max_.sv64, min_.sv64, 8); + return simde_uint8x8_from_private(r_); #else return simde_vmovn_u16( simde_vreinterpretq_u16_s16( @@ -187,6 +255,15 @@ simde_uint16x4_t simde_vabd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private r_, max_, min_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + + max_.sv64 = __riscv_vmaxu_vv_u16m1(a_.sv64, b_.sv64, 4); + min_.sv64 = __riscv_vminu_vv_u16m1(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vsub_vv_u16m1(max_.sv64, min_.sv64, 4); + return simde_uint16x4_from_private(r_); #else return simde_vmovn_u32( simde_vreinterpretq_u32_s32( @@ -206,6 +283,15 @@ simde_uint32x2_t simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_, max_, min_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + + max_.sv64 = __riscv_vmaxu_vv_u32m1(a_.sv64, b_.sv64, 2); + min_.sv64 = __riscv_vminu_vv_u32m1(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vsub_vv_u32m1(max_.sv64, min_.sv64, 2); + return simde_uint32x2_from_private(r_); #else return simde_vmovn_u64( simde_vreinterpretq_u64_s64( @@ -220,6 +306,20 @@ simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vabd_u32(a, b) simde_vabd_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vabdq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabdq_f16(a, b); + #else + return simde_vabsq_f16(simde_vsubq_f16(a, b)); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vabdq_f16 + #define vabdq_f16(a, b) simde_vabdq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vabdq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -277,6 +377,12 @@ simde_vabdq_s8(simde_int8x16_t a, simde_int8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_sub(wasm_i8x16_max(a_.v128, b_.v128), wasm_i8x16_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private max_, min_; + + max_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128, b_.sv128, 16); + min_.sv128 = __riscv_vmin_vv_i8m1(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vsub_vv_i8m1(max_.sv128, min_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -313,6 +419,12 @@ simde_vabdq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_sub_epi16(_mm_max_epi16(a_.m128i, b_.m128i), _mm_min_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_sub(wasm_i16x8_max(a_.v128, b_.v128), wasm_i16x8_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private max_, min_; + + max_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128, b_.sv128, 8); + min_.sv128 = __riscv_vmin_vv_i16m1(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vsub_vv_i16m1(max_.sv128, min_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -348,6 +460,8 @@ simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_X86_SSE4_1_NATIVE) r_.m128i = _mm_sub_epi32(_mm_max_epi32(a_.m128i, b_.m128i), _mm_min_epi32(a_.m128i, b_.m128i)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_i32x4_sub(wasm_i32x4_max(a_.v128, b_.v128), wasm_i32x4_min(a_.v128, b_.v128)); #elif defined(SIMDE_X86_SSE2_NATIVE) const __m128i m = _mm_cmpgt_epi32(b_.m128i, a_.m128i); r_.m128i = @@ -358,6 +472,12 @@ simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) { ), m ); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private max_, min_; + + max_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128, b_.sv128, 4); + min_.sv128 = __riscv_vmin_vv_i32m1(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vsub_vv_i32m1(max_.sv128, min_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -395,6 +515,12 @@ simde_vabdq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_sub_epi8(_mm_max_epu8(a_.m128i, b_.m128i), _mm_min_epu8(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_sub(wasm_u8x16_max(a_.v128, b_.v128), wasm_u8x16_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private max_, min_; + + max_.sv128 = __riscv_vmaxu_vv_u8m1(a_.sv128, b_.sv128, 16); + min_.sv128 = __riscv_vminu_vv_u8m1(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vsub_vv_u8m1(max_.sv128, min_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -432,6 +558,12 @@ simde_vabdq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_sub_epi16(_mm_max_epu16(a_.m128i, b_.m128i), _mm_min_epu16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_sub(wasm_u16x8_max(a_.v128, b_.v128), wasm_u16x8_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private max_, min_; + + max_.sv128 = __riscv_vmaxu_vv_u16m1(a_.sv128, 
b_.sv128, 8); + min_.sv128 = __riscv_vminu_vv_u16m1(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vsub_vv_u16m1(max_.sv128, min_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -467,6 +599,14 @@ simde_vabdq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_sub_epi32(_mm_max_epu32(a_.m128i, b_.m128i), _mm_min_epu32(a_.m128i, b_.m128i)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_i32x4_sub(wasm_u32x4_max(a_.v128, b_.v128), wasm_u32x4_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private max_, min_; + + max_.sv128 = __riscv_vmaxu_vv_u32m1(a_.sv128, b_.sv128, 4); + min_.sv128 = __riscv_vminu_vv_u32m1(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vsub_vv_u32m1(max_.sv128, min_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/abdl_high.h b/arm/neon/abdl_high.h new file mode 100644 index 000000000..4672a5b28 --- /dev/null +++ b/arm/neon/abdl_high.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) +#define SIMDE_ARM_NEON_ABDL_HIGH_H + +#include "abdl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16)), + __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16)) , 8); + r_.sv128 = __riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8); + return simde_int16x8_from_private(r_); + #else + return simde_vabdl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_s8 + #define vabdl_high_s8(a, b) simde_vabdl_high_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8)) , \ + __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8)) , 4); + r_.sv128 = __riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4); + return simde_int32x4_from_private(r_); + #else + return simde_vabdl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_s16 + #define vabdl_high_s16(a, b) simde_vabdl_high_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(a_.sv128 , 2 , 4)) , \ + __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4)) , 2); + r_.sv128 = __riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2); + return simde_int64x2_from_private(r_); + #else + return simde_vabdl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_s32 + #define vabdl_high_s32(a, b) simde_vabdl_high_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + vint16m1_t a_tmp = 
__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \ + __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16)), 8)); + vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \ + __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16)), 8)); + vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8); + r_.sv128 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8)); + return simde_uint16x8_from_private(r_); + #else + return simde_vabdl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_u8 + #define vabdl_high_u8(a, b) simde_vabdl_high_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \ + __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8)), 4)); + vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \ + __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8)), 4)); + vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4)); + return simde_uint32x4_from_private(r_); + #else + return simde_vabdl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_u16 + #define vabdl_high_u16(a, b) simde_vabdl_high_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \ + __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(a_.sv128 , 2 , 4)), 2)); + vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \ + __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4)), 2)); + vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2)); + return simde_uint64x2_from_private(r_); + #else + return simde_vabdl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_u32 + #define vabdl_high_u32(a, b) simde_vabdl_high_u32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) */ diff --git a/arm/neon/abs.h b/arm/neon/abs.h index 3c705e98b..20548db8c 100644 --- a/arm/neon/abs.h +++ b/arm/neon/abs.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if 
!defined(SIMDE_ARM_NEON_ABS_H) @@ -42,11 +44,57 @@ simde_vabsd_s64(int64_t a) { return a < 0 ? -a : a; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,1,0))) #undef vabsd_s64 #define vabsd_s64(a) simde_vabsd_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vabsh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabsh_f16(a); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + + return (a_ >= 0.0f) ? simde_float16_from_float32(a_) : simde_float16_from_float32(-a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vabsh_f16 + #define vabsh_f16(a) simde_vabsh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vabs_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabs_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfabs_v_f16m1(a_.sv64 , 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vabsh_f16(a_.values[i]); + } + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vabs_f16 + #define vabs_f16(a) simde_vabs_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vabs_f32(simde_float32x2_t a) { @@ -57,10 +105,14 @@ simde_vabs_f32(simde_float32x2_t a) { r_, a_ = simde_float32x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfabs_v_f32m1(a_.sv64 , 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + } + #endif return simde_float32x2_from_private(r_); #endif @@ -80,10 +132,14 @@ simde_vabs_f64(simde_float64x1_t a) { r_, a_ = simde_float64x1_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfabs_v_f64m1(a_.sv64 , 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] < 0 ? 
-a_.values[i] : a_.values[i]; + } + #endif return simde_float64x1_from_private(r_); #endif @@ -105,6 +161,8 @@ simde_vabs_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_abs_pi8(a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64 , __riscv_vneg_v_i8m1(a_.sv64 , 8) , 8); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -135,6 +193,8 @@ simde_vabs_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_abs_pi16(a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64 , __riscv_vneg_v_i16m1(a_.sv64 , 4) , 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -165,6 +225,8 @@ simde_vabs_s32(simde_int32x2_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_abs_pi32(a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64 , __riscv_vneg_v_i32m1(a_.sv64 , 2) , 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -193,7 +255,9 @@ simde_vabs_s64(simde_int64x1_t a) { r_, a_ = simde_int64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64 , __riscv_vneg_v_i64m1(a_.sv64 , 1) , 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); #else @@ -211,6 +275,34 @@ simde_vabs_s64(simde_int64x1_t a) { #define vabs_s64(a) simde_vabs_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vabsq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabsq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfabs_v_f16m1(a_.sv128 , 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vabsh_f16(a_.values[i]); + } + #endif + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vabsq_f16 + #define vabsq_f16(a) simde_vabsq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vabsq_f32(simde_float32x4_t a) { @@ -225,6 +317,8 @@ simde_vabsq_f32(simde_float32x4_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfabs_v_f32m1(a_.sv128 , 4); #elif defined(SIMDE_X86_SSE_NATIVE) simde_float32 mask_; uint32_t u32_ = UINT32_C(0x7FFFFFFF); @@ -262,6 +356,8 @@ simde_vabsq_f64(simde_float64x2_t a) { uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF); simde_memcpy(&mask_, &u64_, sizeof(u64_)); 
r_.m128d = _mm_and_pd(_mm_set1_pd(mask_), a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfabs_v_f64m1(a_.sv128 , 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -295,6 +391,8 @@ simde_vabsq_s8(simde_int8x16_t a) { r_.m128i = _mm_min_epu8(a_.m128i, _mm_sub_epi8(_mm_setzero_si128(), a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128 , __riscv_vneg_v_i8m1(a_.sv128 , 16) , 16); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -331,6 +429,8 @@ simde_vabsq_s16(simde_int16x8_t a) { r_.m128i = _mm_max_epi16(a_.m128i, _mm_sub_epi16(_mm_setzero_si128(), a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128 , __riscv_vneg_v_i16m1(a_.sv128 , 8) , 8); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -368,13 +468,15 @@ simde_vabsq_s32(simde_int32x4_t a) { r_.m128i = _mm_sub_epi32(_mm_xor_si128(a_.m128i, m), m); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128 , __riscv_vneg_v_i32m1(a_.sv128 , 4) , 4); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int32_t, 0 - HEDLEY_STATIC_CAST(uint32_t, a_.values[i])) : a_.values[i]; } #endif @@ -389,6 +491,7 @@ simde_vabsq_s32(simde_int32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vabsq_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabsq_s64(a); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -407,13 +510,15 @@ simde_vabsq_s64(simde_int64x2_t a) { r_.m128i = _mm_sub_epi64(_mm_xor_si128(a_.m128i, m), m); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128 , __riscv_vneg_v_i64m1(a_.sv128 , 2) , 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? 
HEDLEY_STATIC_CAST(int64_t, 0 - HEDLEY_STATIC_CAST(uint64_t, a_.values[i])) : a_.values[i]; } #endif diff --git a/arm/neon/add.h b/arm/neon/add.h index d3660f660..5f2922042 100644 --- a/arm/neon/add.h +++ b/arm/neon/add.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADD_H) @@ -35,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16 -simde_vaddh_f16(simde_float16 a, simde_float16 b) { +simde_vaddh_f16(simde_float16_t a, simde_float16_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vaddh_f16(a, b); #else @@ -44,7 +46,8 @@ simde_vaddh_f16(simde_float16 a, simde_float16 b) { return simde_float16_from_float32(af + bf); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vaddh_f16 #define vaddh_f16(a, b) simde_vaddh_f16((a), (b)) #endif @@ -88,15 +91,20 @@ simde_vadd_f16(simde_float16x4_t a, simde_float16x4_t b) { a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfadd_vv_f16m1(a_.sv64, b_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vadd_f16 #define vadd_f16(a, b) simde_vadd_f16((a), (b)) #endif @@ -112,7 +120,9 @@ simde_vadd_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfadd_vv_f32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -140,7 +150,9 @@ simde_vadd_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfadd_vv_f64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -168,7 +180,9 @@ simde_vadd_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi8(a_.m64, b_.m64); @@ -198,7 +212,9 @@ simde_vadd_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i16m1(a_.sv64, b_.sv64, 4); + #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi16(a_.m64, b_.m64); @@ -228,7 +244,9 @@ simde_vadd_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi32(a_.m64, b_.m64); @@ -258,7 +276,9 @@ simde_vadd_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -286,7 +306,9 @@ simde_vadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -314,7 +336,10 @@ simde_vadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -342,7 +367,9 @@ simde_vadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -370,7 +397,9 @@ simde_vadd_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -397,15 +426,21 @@ simde_vaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); - } + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfadd_vv_f16m1(a_.sv128, b_.sv128, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vaddq_f16 #define vaddq_f16(a, b) simde_vaddq_f16((a), (b)) #endif @@ -431,6 +466,8 @@ simde_vaddq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_add_ps(a_.m128, b_.m128); 
#elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfadd_vv_f32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -465,6 +502,8 @@ simde_vaddq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_add_pd(a_.m128d, b_.m128d); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfadd_vv_f64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -499,6 +538,8 @@ simde_vaddq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_add_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -533,6 +574,8 @@ simde_vaddq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_add_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -567,6 +610,8 @@ simde_vaddq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_add_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -601,6 +646,8 @@ simde_vaddq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_add_epi64(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -631,7 +678,9 @@ simde_vaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u8m1(a_.sv128, b_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -661,7 +710,9 @@ simde_vaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u16m1(a_.sv128, b_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -691,7 +742,9 @@ simde_vaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u32m1(a_.sv128, b_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -721,7 +774,9 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u64m1(a_.sv128, b_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -738,6 +793,182 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vaddq_u64(a, b) simde_vaddq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vadd_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vadd_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFF); + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) + #undef vadd_p8 + #define vadd_p8(a, b) simde_vadd_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vadd_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vadd_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFF); + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) + #undef vadd_p16 + #define vadd_p16(a, b) simde_vadd_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vadd_p64(simde_poly64x1_t a, simde_poly64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H) + return vadd_p64(a, b); + #else + simde_poly64x1_private + r_, + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFFFFFFFFFFFFFF); + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H))) + #undef vadd_p64 + #define vadd_p64(a, b) simde_vadd_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vaddq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vaddq_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFF); + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) + #undef vaddq_p8 + #define vaddq_p8(a, b) simde_vaddq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vaddq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + 
return vaddq_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFF); + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) + #undef vaddq_p16 + #define vaddq_p16(a, b) simde_vaddq_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vaddq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H) + return vaddq_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFFFFFFFFFFFFFF); + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H))) + #undef vaddq_p64 + #define vaddq_p64(a, b) simde_vaddq_p64((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vaddq_p128(simde_poly128_t a, simde_poly128_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H) + return vaddq_p128(a, b); + #else + simde_poly128_t mask = 0xFFFFFFFFFFFFFFFFull; + mask = mask << 64; + mask = mask | 0xFFFFFFFFFFFFFFFFull; + return b ^ ((0 ^ a) & mask); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H))) + #undef vaddq_p128 + #define vaddq_p128(a, b) simde_vaddq_p128((a), (b)) +#endif +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/addhn_high.h b/arm/neon/addhn_high.h new file mode 100644 index 000000000..0c96a24d4 --- /dev/null +++ b/arm/neon/addhn_high.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_ADDHN_HIGH_H) +#define SIMDE_ARM_NEON_ADDHN_HIGH_H + +#include "addhn.h" +#include "combine.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vaddhn_high_s16(simde_int8x8_t r, simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_s16(r, a, b); + #else + return simde_vcombine_s8(r, simde_vaddhn_s16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_s16 + #define vaddhn_high_s16(r, a, b) simde_vaddhn_high_s16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vaddhn_high_s32(simde_int16x4_t r, simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_s32(r, a, b); + #else + return simde_vcombine_s16(r, simde_vaddhn_s32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_s32 + #define vaddhn_high_s32(r, a, b) simde_vaddhn_high_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vaddhn_high_s64(simde_int32x2_t r, simde_int64x2_t a, simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_s64(r, a, b); + #else + return simde_vcombine_s32(r, simde_vaddhn_s64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_s64 + #define vaddhn_high_s64(r, a, b) simde_vaddhn_high_s64((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaddhn_high_u16(simde_uint8x8_t r, simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_u16(r, a, b); + #else + return simde_vcombine_u8(r, simde_vaddhn_u16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_u16 + #define vaddhn_high_u16(r, a, b) simde_vaddhn_high_u16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vaddhn_high_u32(simde_uint16x4_t r, simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_u32(r, a, b); + #else + return simde_vcombine_u16(r, simde_vaddhn_u32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_u32 + #define vaddhn_high_u32(r, a, b) simde_vaddhn_high_u32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vaddhn_high_u64(simde_uint32x2_t r, simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_u64(r, a, b); + #else + return simde_vcombine_u32(r, simde_vaddhn_u64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_u64 + #define vaddhn_high_u64(r, a, b) simde_vaddhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ADDHN_HIGH_H) */ diff --git a/arm/neon/addl.h b/arm/neon/addl.h index 539e91e47..cdabc5802 100644 --- a/arm/neon/addl.h +++ b/arm/neon/addl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDL_H) @@ -42,6 +43,13 @@ simde_int16x8_t simde_vaddl_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_s8(a, b); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv64) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vmovl_s8(a), simde_vmovl_s8(b)); #endif @@ -56,6 +64,13 @@ simde_int32x4_t simde_vaddl_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + + r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vmovl_s16(a), simde_vmovl_s16(b)); #endif @@ -70,6 +85,13 @@ simde_int64x2_t simde_vaddl_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + + r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2); + return simde_int64x2_from_private(r_); #else return simde_vaddq_s64(simde_vmovl_s32(a), simde_vmovl_s32(b)); #endif @@ -84,6 +106,13 @@ simde_uint16x8_t simde_vaddl_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vmovl_u8(a), simde_vmovl_u8(b)); #endif @@ -98,6 +127,13 @@ simde_uint32x4_t simde_vaddl_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + + r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vaddq_u32(simde_vmovl_u16(a), simde_vmovl_u16(b)); #endif @@ -112,6 +148,13 @@ simde_uint64x2_t simde_vaddl_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + + r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64) , 4); + return simde_uint64x2_from_private(r_); #else return simde_vaddq_u64(simde_vmovl_u32(a), simde_vmovl_u32(b)); #endif diff --git a/arm/neon/addl_high.h b/arm/neon/addl_high.h index fdef796c9..cf229823e 100644 --- a/arm/neon/addl_high.h +++ b/arm/neon/addl_high.h @@ -23,6 +23,7 @@ 
* Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H) @@ -42,6 +43,15 @@ simde_int16x8_t simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b)); #endif @@ -56,6 +66,15 @@ simde_int32x4_t simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b)); #endif @@ -70,6 +89,15 @@ simde_int64x2_t simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2); + return simde_int64x2_from_private(r_); #else return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b)); #endif @@ -84,6 +112,15 @@ simde_uint16x8_t simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b)); #endif @@ -98,6 +135,15 @@ simde_uint32x4_t simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8); + b_.sv128 = 
__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b)); #endif @@ -112,6 +158,15 @@ simde_uint64x2_t simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b)); #endif diff --git a/arm/neon/addlv.h b/arm/neon/addlv.h index 79d9451b0..37be2c82e 100644 --- a/arm/neon/addlv.h +++ b/arm/neon/addlv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDLV_H) @@ -40,16 +41,22 @@ int16_t simde_vaddlv_s8(simde_int8x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_s8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_s16(simde_vmovl_s8(a)); #else simde_int8x8_private a_ = simde_int8x8_to_private(a); int16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vwredsum_vs_i8m1_i16m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -64,16 +71,22 @@ int32_t simde_vaddlv_s16(simde_int16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_s16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_s32(simde_vmovl_s16(a)); #else simde_int16x4_private a_ = simde_int16x4_to_private(a); int32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vwredsum_vs_i16m1_i32m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -88,16 +101,22 @@ int64_t simde_vaddlv_s32(simde_int32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_s32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_s64(simde_vmovl_s32(a)); #else simde_int32x2_private a_ = simde_int32x2_to_private(a); int64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r 
+= a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0 , 1); + vint64m1_t sum = __riscv_vwredsum_vs_i32m1_i64m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_i64m1_i64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -112,16 +131,22 @@ uint16_t simde_vaddlv_u8(simde_uint8x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_u8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_u16(simde_vmovl_u8(a)); #else simde_uint8x8_private a_ = simde_uint8x8_to_private(a); uint16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vwredsumu_vs_u8m1_u16m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_u16m1_u16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -136,16 +161,22 @@ uint32_t simde_vaddlv_u16(simde_uint16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_u16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_u32(simde_vmovl_u16(a)); #else simde_uint16x4_private a_ = simde_uint16x4_to_private(a); uint32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vwredsumu_vs_u16m1_u32m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -160,16 +191,22 @@ uint64_t simde_vaddlv_u32(simde_uint32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_u32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_u64(simde_vmovl_u32(a)); #else simde_uint32x2_private a_ = simde_uint32x2_to_private(a); uint64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0 , 1); + vuint64m1_t sum = __riscv_vwredsumu_vs_u32m1_u64m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_u64m1_u64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -184,14 +221,26 @@ int16_t simde_vaddlvq_s8(simde_int8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_s8(a); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i a_ = simde_int8x16_to_m128i(a); + a_ = _mm_xor_si128(a_, _mm_set1_epi8('\x80')); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi16(a_, _mm_shuffle_epi32(a_, 0xEE)); + return HEDLEY_STATIC_CAST(int16_t, _mm_cvtsi128_si32(a_) - 2048); #else simde_int8x16_private a_ = simde_int8x16_to_private(a); int16_t r = 0; - 
SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vwredsum_vs_i8m1_i16m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -206,14 +255,27 @@ int32_t simde_vaddlvq_s16(simde_int16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_s16(a); + #elif defined(SIMDE_X86_SSSE3_NATIVE) && !defined(HEDLEY_MSVC_VERSION) + __m128i a_ = simde_int16x8_to_m128i(a); + a_ = _mm_xor_si128(a_, _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, 0x8000))); + a_ = _mm_shuffle_epi8(a_, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0)); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi32(a_, _mm_srli_si128(a_, 7)); + return _mm_cvtsi128_si32(a_) - 262144; #else simde_int16x8_private a_ = simde_int16x8_to_private(a); int32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vwredsum_vs_i16m1_i32m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -232,10 +294,16 @@ simde_vaddlvq_s32(simde_int32x4_t a) { simde_int32x4_private a_ = simde_int32x4_to_private(a); int64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0 , 1); + vint64m1_t sum = __riscv_vwredsum_vs_i32m1_i64m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_i64m1_i64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -250,14 +318,25 @@ uint16_t simde_vaddlvq_u8(simde_uint8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_u8(a); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i a_ = simde_uint8x16_to_m128i(a); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi16(a_, _mm_shuffle_epi32(a_, 0xEE)); + return HEDLEY_STATIC_CAST(uint16_t, _mm_cvtsi128_si32(a_)); #else simde_uint8x16_private a_ = simde_uint8x16_to_private(a); uint16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vwredsumu_vs_u8m1_u16m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_u16m1_u16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -272,16 +351,28 @@ uint32_t simde_vaddlvq_u16(simde_uint16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_u16(a); + #elif defined(SIMDE_X86_SSSE3_NATIVE) + __m128i a_ = simde_uint16x8_to_m128i(a); + a_ = _mm_shuffle_epi8(a_, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 
0)); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi32(a_, _mm_srli_si128(a_, 7)); + return HEDLEY_STATIC_CAST(uint32_t, _mm_cvtsi128_si32(a_)); #else simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - uint32_t r = 0; - - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + uint32_t r = 0; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vwredsumu_vs_u16m1_u32m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; + #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -298,10 +389,16 @@ simde_vaddlvq_u32(simde_uint32x4_t a) { simde_uint32x4_private a_ = simde_uint32x4_to_private(a); uint64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0 , 1); + vuint64m1_t sum = __riscv_vwredsumu_vs_u32m1_u64m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_u64m1_u64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif diff --git a/arm/neon/addv.h b/arm/neon/addv.h index bcc082b34..00ed4c8e1 100644 --- a/arm/neon/addv.h +++ b/arm/neon/addv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDV_H) @@ -43,11 +44,17 @@ simde_vaddv_f32(simde_float32x2_t a) { #else simde_float32x2_private a_ = simde_float32x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0 , 1); + vfloat32m1_t sum = __riscv_vfredosum_vs_f32m1_f32m1(a_.sv64 , zero , 2); + r = __riscv_vfmv_f_s_f32m1_f32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -67,11 +74,17 @@ simde_vaddv_s8(simde_int8x8_t a) { #else simde_int8x8_private a_ = simde_int8x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t zero = __riscv_vmv_v_x_i8m1(0 , 1); + vint8m1_t sum = __riscv_vredsum_vs_i8m1_i8m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_i8m1_i8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -91,11 +104,17 @@ simde_vaddv_s16(simde_int16x4_t a) { #else simde_int16x4_private a_ = simde_int16x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vredsum_vs_i16m1_i16m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 
0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -115,11 +134,17 @@ simde_vaddv_s32(simde_int32x2_t a) { #else simde_int32x2_private a_ = simde_int32x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vredsum_vs_i32m1_i32m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -139,11 +164,17 @@ simde_vaddv_u8(simde_uint8x8_t a) { #else simde_uint8x8_private a_ = simde_uint8x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t zero = __riscv_vmv_v_x_u8m1(0 , 1); + vuint8m1_t sum = __riscv_vredsum_vs_u8m1_u8m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_u8m1_u8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -163,11 +194,17 @@ simde_vaddv_u16(simde_uint16x4_t a) { #else simde_uint16x4_private a_ = simde_uint16x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vredsum_vs_u16m1_u16m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_u16m1_u16(sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -187,11 +224,17 @@ simde_vaddv_u32(simde_uint32x2_t a) { #else simde_uint32x2_private a_ = simde_uint32x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -211,11 +254,17 @@ simde_vaddvq_f32(simde_float32x4_t a) { #else simde_float32x4_private a_ = simde_float32x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0 , 1); + vfloat32m1_t sum = __riscv_vfredosum_vs_f32m1_f32m1(a_.sv128 , zero , 4); + r = __riscv_vfmv_f_s_f32m1_f32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -235,11 +284,17 @@ simde_vaddvq_f64(simde_float64x2_t a) { #else simde_float64x2_private a_ = simde_float64x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / 
sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1_t zero = __riscv_vfmv_v_f_f64m1(0 , 1); + vfloat64m1_t sum = __riscv_vfredosum_vs_f64m1_f64m1(a_.sv128 , zero , 2); + r = __riscv_vfmv_f_s_f64m1_f64 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -259,11 +314,17 @@ simde_vaddvq_s8(simde_int8x16_t a) { #else simde_int8x16_private a_ = simde_int8x16_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t zero = __riscv_vmv_v_x_i8m1(0 , 1); + vint8m1_t sum = __riscv_vredsum_vs_i8m1_i8m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_i8m1_i8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -283,11 +344,17 @@ simde_vaddvq_s16(simde_int16x8_t a) { #else simde_int16x8_private a_ = simde_int16x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vredsum_vs_i16m1_i16m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -307,11 +374,17 @@ simde_vaddvq_s32(simde_int32x4_t a) { #else simde_int32x4_private a_ = simde_int32x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vredsum_vs_i32m1_i32m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -331,11 +404,17 @@ simde_vaddvq_s64(simde_int64x2_t a) { #else simde_int64x2_private a_ = simde_int64x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0 , 1); + vint64m1_t sum = __riscv_vredsum_vs_i64m1_i64m1(a_.sv128 , zero , 2); + r = __riscv_vmv_x_s_i64m1_i64 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -352,14 +431,25 @@ simde_vaddvq_u8(simde_uint8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r = vaddvq_u8(a); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i a_ = simde_uint8x16_to_m128i(a); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi8(a_, _mm_shuffle_epi32(a_, 0xEE)); + return HEDLEY_STATIC_CAST(uint8_t, _mm_cvtsi128_si32(a_)); #else simde_uint8x16_private a_ = simde_uint8x16_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r 
+= a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t zero = __riscv_vmv_v_x_u8m1(0 , 1); + vuint8m1_t sum = __riscv_vredsum_vs_u8m1_u8m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_u8m1_u8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -379,11 +469,17 @@ simde_vaddvq_u16(simde_uint16x8_t a) { #else simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vredsum_vs_u16m1_u16m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_u16m1_u16(sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -403,11 +499,17 @@ simde_vaddvq_u32(simde_uint32x4_t a) { #else simde_uint32x4_private a_ = simde_uint32x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -427,11 +529,17 @@ simde_vaddvq_u64(simde_uint64x2_t a) { #else simde_uint64x2_private a_ = simde_uint64x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0 , 1); + vuint64m1_t sum = __riscv_vredsum_vs_u64m1_u64m1(a_.sv128 , zero , 2); + r = __riscv_vmv_x_s_u64m1_u64 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; diff --git a/arm/neon/addw.h b/arm/neon/addw.h index ec736215f..f38b4d777 100644 --- a/arm/neon/addw.h +++ b/arm/neon/addw.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDW_H) @@ -41,14 +42,17 @@ simde_int16x8_t simde_vaddw_s8(simde_int16x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s16(a, simde_vmovl_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64); + r_.sv128 = __riscv_vwadd_wv_i16m1(a_.sv128, vb, 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += 
a_.values; #else @@ -71,14 +75,17 @@ simde_int32x4_t simde_vaddw_s16(simde_int32x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s32(a, simde_vmovl_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + r_.sv128 = __riscv_vwadd_wv_i32m1(a_.sv128, vb, 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -101,14 +108,17 @@ simde_int64x2_t simde_vaddw_s32(simde_int64x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s64(a, simde_vmovl_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + r_.sv128 = __riscv_vwadd_wv_i64m1(a_.sv128, vb, 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -131,14 +141,17 @@ simde_uint16x8_t simde_vaddw_u8(simde_uint16x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u16(a, simde_vmovl_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x8_private b_ = simde_uint8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64); + r_.sv128 = __riscv_vwaddu_wv_u16m1(a_.sv128, vb, 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -161,14 +174,17 @@ simde_uint32x4_t simde_vaddw_u16(simde_uint32x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u32(a, simde_vmovl_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + r_.sv128 = __riscv_vwaddu_wv_u32m1(a_.sv128, vb, 4); + #elif 
(SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -191,14 +207,17 @@ simde_uint64x2_t simde_vaddw_u32(simde_uint64x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u64(a, simde_vmovl_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + r_.sv128 = __riscv_vwaddu_wv_u64m1(a_.sv128, vb, 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else diff --git a/arm/neon/addw_high.h b/arm/neon/addw_high.h index 1f2df9052..be293ac60 100644 --- a/arm/neon/addw_high.h +++ b/arm/neon/addw_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDW_HIGH_H) @@ -30,6 +31,7 @@ #include "types.h" #include "movl_high.h" #include "add.h" +#include "addw.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -40,17 +42,22 @@ simde_int16x8_t simde_vaddw_high_s8(simde_int16x8_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s16(a, simde_vmovl_high_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_high = __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16)); + r_.sv128 = __riscv_vwadd_wv_i16m1(a_.sv128, b_high, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_int16x8_from_private(r_); #endif @@ -65,17 +72,22 @@ simde_int32x4_t simde_vaddw_high_s16(simde_int32x4_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s32(a, simde_vmovl_high_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x8_private b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16mf2_t b_high = __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8)); + r_.sv128 = 
__riscv_vwadd_wv_i32m1(a_.sv128, b_high, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_int32x4_from_private(r_); #endif @@ -90,18 +102,21 @@ simde_int64x2_t simde_vaddw_high_s32(simde_int64x2_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s64(a, simde_vmovl_high_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x4_private b_ = simde_int32x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32mf2_t b_high = __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4)); + r_.sv128 = __riscv_vwadd_wv_i64m1(a_.sv128, b_high, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_int64x2_from_private(r_); #endif } @@ -115,18 +130,21 @@ simde_uint16x8_t simde_vaddw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u16(a, simde_vmovl_high_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x16_private b_ = simde_uint8x16_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8mf2_t b_high = __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16)); + r_.sv128 = __riscv_vwaddu_wv_u16m1(a_.sv128, b_high, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_uint16x8_from_private(r_); #endif } @@ -140,18 +158,21 @@ simde_uint32x4_t simde_vaddw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u32(a, simde_vmovl_high_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16mf2_t b_high = __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8)); + r_.sv128 = __riscv_vwaddu_wv_u32m1(a_.sv128, b_high, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_uint32x4_from_private(r_); #endif } @@ -165,18 +186,21 @@ simde_uint64x2_t simde_vaddw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u64(a, simde_vmovl_high_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32mf2_t b_high = __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4)); + r_.sv128 = __riscv_vwaddu_wv_u64m1(a_.sv128, b_high, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_uint64x2_from_private(r_); #endif } diff --git a/arm/neon/aes.h b/arm/neon/aes.h new file mode 100644 index 000000000..bfda94516 --- /dev/null +++ b/arm/neon/aes.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_AES_H) +#define SIMDE_ARM_NEON_AES_H + +#include "types.h" +#include "../../simde-aes.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +static uint8_t simde_xtime(uint8_t x) +{ + return HEDLEY_STATIC_CAST(uint8_t, (x<<1) ^ (((x>>7) & 1) * 0x1b)); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaeseq_u8(simde_uint8x16_t data, simde_uint8x16_t key) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaeseq_u8(data, key); + #else + /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */ + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(data), + b_ = simde_uint8x16_to_private(key); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i]; + } + // AESShiftRows + uint8_t tmp; + tmp = r_.values[1]; + r_.values[1] = r_.values[5]; + r_.values[5] = r_.values[9]; + r_.values[9] = r_.values[13]; + r_.values[13] = tmp; + + tmp = r_.values[2]; + r_.values[2] = r_.values[10]; + r_.values[10] = tmp; + + tmp = r_.values[6]; + r_.values[6] = r_.values[14]; + r_.values[14] = tmp; + + tmp = r_.values[3]; + r_.values[3] = r_.values[15]; + r_.values[15] = r_.values[11]; + r_.values[11] = r_.values[7]; + r_.values[7] = tmp; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_x_aes_s_box[r_.values[i]]; + } + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) + #undef vaeseq_u8 + #define vaeseq_u8(data, key) simde_vaeseq_u8((data), (key)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaesdq_u8(simde_uint8x16_t data, simde_uint8x16_t key) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaesdq_u8(data, key); + #else + /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */ + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(data), + b_ = simde_uint8x16_to_private(key); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i]; + } + // AESInvShiftRows + uint8_t tmp; + tmp = r_.values[13]; + r_.values[13] = r_.values[9]; + r_.values[9] = r_.values[5]; + r_.values[5] = r_.values[1]; + r_.values[1] = tmp; + + tmp = r_.values[2]; + r_.values[2] = r_.values[10]; + r_.values[10] = tmp; + + tmp = r_.values[6]; + r_.values[6] = r_.values[14]; + r_.values[14] = tmp; + + tmp = r_.values[3]; + r_.values[3] = r_.values[7]; + r_.values[7] = r_.values[11]; + r_.values[11] = r_.values[15]; + r_.values[15] = tmp; + for(int i = 0; i < 16; ++i) { + r_.values[i] = simde_x_aes_inv_s_box[r_.values[i]]; + } + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) + #undef vaesdq_u8 + #define vaesdq_u8(data, key) simde_vaesdq_u8((data), (key)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaesmcq_u8(simde_uint8x16_t data) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaesmcq_u8(data); + #else + /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */ + simde_uint8x16_private + a_ = 
simde_uint8x16_to_private(data); + uint8_t i; + uint8_t Tmp, Tm, t; + for (i = 0; i < 4; ++i) + { + t = a_.values[i*4+0]; + Tmp = a_.values[i*4+0] ^ a_.values[i*4+1] ^ a_.values[i*4+2] ^ a_.values[i*4+3] ; + Tm = a_.values[i*4+0] ^ a_.values[i*4+1] ; Tm = simde_xtime(Tm); a_.values[i*4+0] ^= Tm ^ Tmp ; + Tm = a_.values[i*4+1] ^ a_.values[i*4+2] ; Tm = simde_xtime(Tm); a_.values[i*4+1] ^= Tm ^ Tmp ; + Tm = a_.values[i*4+2] ^ a_.values[i*4+3] ; Tm = simde_xtime(Tm); a_.values[i*4+2] ^= Tm ^ Tmp ; + Tm = a_.values[i*4+3] ^ t ; Tm = simde_xtime(Tm); a_.values[i*4+3] ^= Tm ^ Tmp ; + } + return simde_uint8x16_from_private(a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) + #undef vaesmcq_u8 + #define vaesmcq_u8(data) simde_vaesmcq_u8((data)) +#endif + +static uint8_t Multiply(uint8_t x, uint8_t y) +{ + return (((y & 1) * x) ^ + ((y>>1 & 1) * simde_xtime(x)) ^ + ((y>>2 & 1) * simde_xtime(simde_xtime(x))) ^ + ((y>>3 & 1) * simde_xtime(simde_xtime(simde_xtime(x)))) ^ + ((y>>4 & 1) * simde_xtime(simde_xtime(simde_xtime(simde_xtime(x)))))); /* this last call to simde_xtime() can be omitted */ +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaesimcq_u8(simde_uint8x16_t data) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaesimcq_u8(data); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(data), + r_; + /* ref: simde/simde/x86/aes.h */ + #if defined(SIMDE_X86_AES_NATIVE) + r_.m128i = _mm_aesimc_si128(a_.m128i); + #else + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = a_.values[Nb*j+i]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(4, col, res); + + for (i = 0; i < 4; i++) { + r_.values[Nb*j+i] = res[i]; + } + } + #endif + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) + #undef vaesimcq_u8 + #define vaesimcq_u8(data) simde_vaesimcq_u8((data)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_AES_H) */ diff --git a/arm/neon/and.h b/arm/neon/and.h index 381154228..185683d75 100644 --- a/arm/neon/and.h +++ b/arm/neon/and.h @@ -47,6 +47,8 @@ simde_vand_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -77,6 +79,8 @@ simde_vand_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -107,6 +111,8 @@ simde_vand_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -137,6 +143,8 @@ simde_vand_s64(simde_int64x1_t a, simde_int64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) 
r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -167,6 +175,8 @@ simde_vand_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -197,6 +207,8 @@ simde_vand_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -227,6 +239,8 @@ simde_vand_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -257,6 +271,8 @@ simde_vand_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -291,6 +307,8 @@ simde_vandq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -325,6 +343,8 @@ simde_vandq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -359,6 +379,8 @@ simde_vandq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -393,6 +415,8 @@ simde_vandq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -427,6 +451,8 @@ simde_vandq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -461,6 +487,8 @@ simde_vandq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = 
_mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -495,6 +523,8 @@ simde_vandq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -529,6 +559,8 @@ simde_vandq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else diff --git a/arm/neon/bcax.h b/arm/neon/bcax.h index 929d8f8d8..746d8d613 100644 --- a/arm/neon/bcax.h +++ b/arm/neon/bcax.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_BCAX_H) @@ -39,13 +40,22 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16_t simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + r_.sv128 = __riscv_vxor_vv_u8m1(a_.sv128, __riscv_vand_vv_u8m1(b_.sv128 , \ + __riscv_vnot_v_u8m1(c_.sv128 , 16), 16), 16); + return simde_uint8x16_from_private(r_); #else return simde_veorq_u8(a, simde_vbicq_u8(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u8 #define vbcaxq_u8(a, b, c) simde_vbcaxq_u8(a, b, c) #endif @@ -53,13 +63,22 @@ simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + r_.sv128 = __riscv_vxor_vv_u16m1(a_.sv128, __riscv_vand_vv_u16m1(b_.sv128 , \ + __riscv_vnot_v_u16m1(c_.sv128 , 8), 8), 8); + return simde_uint16x8_from_private(r_); #else return simde_veorq_u16(a, simde_vbicq_u16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u16 #define vbcaxq_u16(a, b, c) 
simde_vbcaxq_u16(a, b, c) #endif @@ -67,13 +86,22 @@ simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + r_.sv128 = __riscv_vxor_vv_u32m1(a_.sv128, __riscv_vand_vv_u32m1(b_.sv128 , \ + __riscv_vnot_v_u32m1(c_.sv128 , 4), 4), 4); + return simde_uint32x4_from_private(r_); #else return simde_veorq_u32(a, simde_vbicq_u32(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u32 #define vbcaxq_u32(a, b, c) simde_vbcaxq_u32(a, b, c) #endif @@ -81,13 +109,22 @@ simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b), + c_ = simde_uint64x2_to_private(c); + r_.sv128 = __riscv_vxor_vv_u64m1(a_.sv128, __riscv_vand_vv_u64m1(b_.sv128 , \ + __riscv_vnot_v_u64m1(c_.sv128 , 2), 2), 2); + return simde_uint64x2_from_private(r_); #else return simde_veorq_u64(a, simde_vbicq_u64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u64 #define vbcaxq_u64(a, b, c) simde_vbcaxq_u64(a, b, c) #endif @@ -95,13 +132,22 @@ simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int8x16_t simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + r_.sv128 = __riscv_vxor_vv_i8m1(a_.sv128, __riscv_vand_vv_i8m1(b_.sv128 , \ + __riscv_vnot_v_i8m1(c_.sv128 , 16), 16), 16); + return simde_int8x16_from_private(r_); #else return simde_veorq_s8(a, simde_vbicq_s8(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s8 #define vbcaxq_s8(a, b, c) simde_vbcaxq_s8(a, b, c) #endif @@ -109,13 +155,22 @@ simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, 
simde_int8x16_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int16x8_t simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + r_.sv128 = __riscv_vxor_vv_i16m1(a_.sv128, __riscv_vand_vv_i16m1(b_.sv128 , \ + __riscv_vnot_v_i16m1(c_.sv128 , 8), 8), 8); + return simde_int16x8_from_private(r_); #else return simde_veorq_s16(a,simde_vbicq_s16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s16 #define vbcaxq_s16(a, b, c) simde_vbcaxq_s16(a, b, c) #endif @@ -123,13 +178,22 @@ simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + r_.sv128 = __riscv_vxor_vv_i32m1(a_.sv128, __riscv_vand_vv_i32m1(b_.sv128 , \ + __riscv_vnot_v_i32m1(c_.sv128 , 4), 4), 4); + return simde_int32x4_from_private(r_); #else return simde_veorq_s32(a, simde_vbicq_s32(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s32 #define vbcaxq_s32(a, b, c) simde_vbcaxq_s32(a, b, c) #endif @@ -137,13 +201,22 @@ simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vbcaxq_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b), + c_ = simde_int64x2_to_private(c); + r_.sv128 = __riscv_vxor_vv_i64m1(a_.sv128, __riscv_vand_vv_i64m1(b_.sv128 , \ + __riscv_vnot_v_i64m1(c_.sv128 , 2), 2), 2); + return simde_int64x2_from_private(r_); #else return simde_veorq_s64(a, simde_vbicq_s64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s64 #define vbcaxq_s64(a, b, c) simde_vbcaxq_s64(a, b, c) #endif diff --git a/arm/neon/bic.h b/arm/neon/bic.h index 49cc7f396..88a68ae5f 100644 --- a/arm/neon/bic.h +++ b/arm/neon/bic.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Chi-Wei Chu 
(Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_BIC_H) @@ -48,9 +49,13 @@ simde_vbic_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i8m1(a_.sv64 , __riscv_vnot_v_i8m1(b_.sv64 , 8) , 8); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int8x8_from_private(r_); @@ -75,9 +80,13 @@ simde_vbic_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i16m1(a_.sv64 , __riscv_vnot_v_i16m1(b_.sv64 , 4) , 4); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int16x4_from_private(r_); @@ -102,9 +111,13 @@ simde_vbic_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i32m1(a_.sv64 , __riscv_vnot_v_i32m1(b_.sv64 , 2) , 2); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int32x2_from_private(r_); @@ -129,9 +142,13 @@ simde_vbic_s64(simde_int64x1_t a, simde_int64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i64m1(a_.sv64 , __riscv_vnot_v_i64m1(b_.sv64 , 1) , 1); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int64x1_from_private(r_); @@ -156,9 +173,13 @@ simde_vbic_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u8m1(a_.sv64 , __riscv_vnot_v_u8m1(b_.sv64 , 8) , 8); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint8x8_from_private(r_); @@ -183,9 +204,13 @@ simde_vbic_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u16m1(a_.sv64 , __riscv_vnot_v_u16m1(b_.sv64 , 4) , 4); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + 
r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint16x4_from_private(r_); @@ -210,9 +235,13 @@ simde_vbic_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u32m1(a_.sv64 , __riscv_vnot_v_u32m1(b_.sv64 , 2) , 2); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint32x2_from_private(r_); @@ -237,9 +266,13 @@ simde_vbic_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u64m1(a_.sv64 , __riscv_vnot_v_u64m1(b_.sv64 , 1) , 1); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint64x1_from_private(r_); @@ -263,7 +296,9 @@ simde_vbicq_s8(simde_int8x16_t a, simde_int8x16_t b) { b_ = simde_int8x16_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128 , __riscv_vnot_v_i8m1(b_.sv128 , 16) , 16); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -294,7 +329,9 @@ simde_vbicq_s16(simde_int16x8_t a, simde_int16x8_t b) { b_ = simde_int16x8_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i16m1(a_.sv128 , __riscv_vnot_v_i16m1(b_.sv128 , 8) , 8); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -325,7 +362,9 @@ simde_vbicq_s32(simde_int32x4_t a, simde_int32x4_t b) { b_ = simde_int32x4_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i32m1(a_.sv128 , __riscv_vnot_v_i32m1(b_.sv128 , 4) , 4); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -356,7 +395,9 @@ simde_vbicq_s64(simde_int64x2_t a, simde_int64x2_t b) { b_ = simde_int64x2_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i64m1(a_.sv128 , __riscv_vnot_v_i64m1(b_.sv128 , 2) , 2); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -387,7 +428,9 @@ simde_vbicq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { b_ = simde_uint8x16_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u8m1(a_.sv128 , __riscv_vnot_v_u8m1(b_.sv128 , 16) , 16); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); 
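
For reference, the RVV and SSE2/SSSE3 paths added in the hunks above all reduce to the same scalar semantics as the loops they replace: the vaddlv* reductions widen each lane before summing (so an 8-lane u8 sum does not wrap at 8 bits), and the bcax/bic hunks express "bit clear" as AND with the complement. The snippet below is a minimal, portable sketch of those two identities only; the ref_* helper names are illustrative and are not part of SIMDe or of this patch.

/* Portable reference for the semantics implemented by the RVV and
 * SSE2/SSSE3 paths above (editorially illustrative names, not SIMDe API). */
#include <stdint.h>
#include <stdio.h>

/* vaddlv_u8: widen each u8 lane to u16, then sum horizontally.
 * __riscv_vwredsumu_vs_u8m1_u16m1 and the _mm_sad_epu8-based path
 * both compute this value. */
static uint16_t ref_vaddlv_u8(const uint8_t a[8]) {
  uint16_t r = 0;
  for (int i = 0; i < 8; i++) {
    r = (uint16_t) (r + a[i]);   /* accumulate in the widened type */
  }
  return r;
}

/* vbcaxq-style "bit clear and exclusive OR" on one lane:
 * r = a ^ (b & ~c), which the patch builds from
 * __riscv_vxor_vv_* / __riscv_vand_vv_* / __riscv_vnot_v_*. */
static uint8_t ref_bcax_u8(uint8_t a, uint8_t b, uint8_t c) {
  return (uint8_t) (a ^ (b & (uint8_t) ~c));
}

int main(void) {
  const uint8_t v[8] = { 250, 250, 250, 250, 1, 2, 3, 4 };
  printf("vaddlv_u8 -> %u\n", ref_vaddlv_u8(v));                     /* 1010 */
  printf("bcax      -> 0x%02x\n", ref_bcax_u8(0xF0u, 0x0Fu, 0x05u)); /* 0xfa */
  return 0;
}

The same widening-then-reduce shape recurs throughout addl_high.h, addlv.h and addw*.h (only the element width and vl argument change), and the a & ~b identity is exactly what the vbic*/vbcaxq fallbacks in bic.h and bcax.h encode with vector intrinsics.
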
@@ -418,7 +461,9 @@ simde_vbicq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { b_ = simde_uint16x8_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u16m1(a_.sv128 , __riscv_vnot_v_u16m1(b_.sv128 , 8) , 8); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -449,7 +494,9 @@ simde_vbicq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { b_ = simde_uint32x4_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u32m1(a_.sv128 , __riscv_vnot_v_u32m1(b_.sv128 , 4) , 4); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -480,7 +527,9 @@ simde_vbicq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { b_ = simde_uint64x2_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u64m1(a_.sv128 , __riscv_vnot_v_u64m1(b_.sv128 , 2) , 2); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); diff --git a/arm/neon/bsl.h b/arm/neon/bsl.h index 0fc4ff270..93ded55de 100644 --- a/arm/neon/bsl.h +++ b/arm/neon/bsl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_BSL_H) @@ -61,7 +62,8 @@ simde_vbsl_f16(simde_uint16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { return simde_vreinterpret_f16_u16(simde_uint16x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vbsl_f16 #define vbsl_f16(a, b, c) simde_vbsl_f16((a), (b), (c)) #endif @@ -380,7 +382,8 @@ simde_vbslq_f16(simde_uint16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { return simde_vreinterpretq_f16_u16(simde_uint16x8_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vbslq_f16 #define vbslq_f16(a, b, c) simde_vbslq_f16((a), (b), (c)) #endif @@ -755,6 +758,156 @@ simde_vbslq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { #define vbslq_u64(a, b, c) simde_vbslq_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vbsl_p8(simde_uint8x8_t a, simde_poly8x8_t b, simde_poly8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbsl_p8(a, b, c); + #else + simde_poly8x8_private + r_, + b_ = simde_poly8x8_to_private(b), + c_ = simde_poly8x8_to_private(c); + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbsl_p8 + #define vbsl_p8(a, b, c) simde_vbsl_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t 
+simde_vbsl_p16(simde_uint16x4_t a, simde_poly16x4_t b, simde_poly16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbsl_p16(a, b, c); + #else + simde_poly16x4_private + r_, + b_ = simde_poly16x4_to_private(b), + c_ = simde_poly16x4_to_private(c); + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbsl_p16 + #define vbsl_p16(a, b, c) simde_vbsl_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vbsl_p64(simde_uint64x1_t a, simde_poly64x1_t b, simde_poly64x1_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vbsl_p64(a, b, c); + #else + simde_poly64x1_private + r_, + b_ = simde_poly64x1_to_private(b), + c_ = simde_poly64x1_to_private(c); + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbsl_p64 + #define vbsl_p64(a, b, c) simde_vbsl_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vbslq_p8(simde_uint8x16_t a, simde_poly8x16_t b, simde_poly8x16_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbslq_p8(a, b, c); + #else + simde_poly8x16_private + r_, + b_ = simde_poly8x16_to_private(b), + c_ = simde_poly8x16_to_private(c); + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbslq_p8 + #define vbslq_p8(a, b, c) simde_vbslq_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vbslq_p16(simde_uint16x8_t a, simde_poly16x8_t b, simde_poly16x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbslq_p16(a, b, c); + #else + simde_poly16x8_private + r_, + b_ = simde_poly16x8_to_private(b), + c_ = simde_poly16x8_to_private(c); + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbslq_p16 + #define vbslq_p16(a, b, c) simde_vbslq_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vbslq_p64(simde_uint64x2_t a, simde_poly64x2_t b, simde_poly64x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vbslq_p64(a, b, c); + #else + simde_poly64x2_private + r_, + b_ = simde_poly64x2_to_private(b), + c_ = simde_poly64x2_to_private(c); + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly64x2_from_private(r_); + 
#endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbslq_p64 + #define vbslq_p64(a, b, c) simde_vbslq_p64((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/cadd_rot270.h b/arm/neon/cadd_rot270.h new file mode 100644 index 000000000..cc2ca641f --- /dev/null +++ b/arm/neon/cadd_rot270.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_CADD_ROT270_H) +#define SIMDE_ARM_NEON_CADD_ROT270_H + +#include "add.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +_Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"") +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcadd_rot270_f16(a, b); + #else + simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[4] = {5, 0, 7, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 5, 0, 7, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]); + r_.values[2 * i + 1] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]); + } + #endif + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) 
|| (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) + #undef vcadd_rot270_f16 + #define vcadd_rot270_f16(a, b) simde_vcadd_rot270_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot270_f16(a, b); + #else + simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[8] = {9, 0, 11, 2, 13, 4, 15, 6}; + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]); + r_.values[2 * i + 1] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]); + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) + #undef vcaddq_rot270_f16 + #define vcaddq_rot270_f16(a, b) simde_vcaddq_rot270_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) + return vcadd_rot270_f32(a, b); + #else + simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfadd_vv_f32m1(op1, a_.sv64, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i]; + r_.values[2 * i + 
1] = -(b_.values[2 * i]) + a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcadd_rot270_f32 + #define vcadd_rot270_f32(a, b) simde_vcadd_rot270_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) + return vcaddq_rot270_f32(a, b); + #else + simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {5, 0, 7, 2}; + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfadd_vv_f32m1(op1, a_.sv128, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i]; + r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcaddq_rot270_f32 + #define vcaddq_rot270_f32(a, b) simde_vcaddq_rot270_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) + return vcaddq_rot270_f64(a, b); + #else + simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + uint64_t idx1[2] = {3, 0}; + vfloat64m2_t b_tmp = __riscv_vlmul_ext_v_f64m1_f64m2 (b_.sv128); + vfloat64m1_t op1 = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vrgather_vv_f64m2(__riscv_vslideup_vx_f64m2( \ + __riscv_vfneg_v_f64m2(b_tmp, 2), b_tmp, 2, 4), __riscv_vle64_v_u64m2(idx1, 2), 2)); + r_.sv128 = __riscv_vfadd_vv_f64m1(op1, a_.sv128, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i]; + r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1]; + } + #endif + return 
simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcaddq_rot270_f64 + #define vcaddq_rot270_f64(a, b) simde_vcaddq_rot270_f64(a, b) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT270_H) */ diff --git a/arm/neon/cadd_rot90.h b/arm/neon/cadd_rot90.h new file mode 100644 index 000000000..5e3a3cb34 --- /dev/null +++ b/arm/neon/cadd_rot90.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_CADD_ROT90_H) +#define SIMDE_ARM_NEON_CADD_ROT90_H + +#include "add.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +_Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"") +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcadd_rot90_f16(a, b); + #else + simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 4, 3, 6); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i + 1])), a_.values[2 * i]); + r_.values[2 * i + 1] = simde_vaddh_f16(b_.values[2 * i], a_.values[2 * i + 1]); + } + #endif + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) + #undef vcadd_rot90_f16 + #define vcadd_rot90_f16(a, b) simde_vcadd_rot90_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot90_f16(a, b); + #else + simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 1, 8, 3, 10, 5, 
12, 7, 14); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i + 1])), a_.values[2 * i]); + r_.values[2 * i + 1] = simde_vaddh_f16(b_.values[2 * i], a_.values[2 * i + 1]); + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) + #undef vcaddq_rot90_f16 + #define vcaddq_rot90_f16(a, b) simde_vcaddq_rot90_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) + return vcadd_rot90_f32(a, b); + #else + simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfadd_vv_f32m1(op1, a_.sv64, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i]; + r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcadd_rot90_f32 + #define vcadd_rot90_f32(a, b) simde_vcadd_rot90_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) + return vcaddq_rot90_f32(a, b); + #else + simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {1, 4, 3, 6}; + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfadd_vv_f32m1(op1, a_.sv128, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values = b_.values + a_.values; + #else + 
SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i]; + r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcaddq_rot90_f32 + #define vcaddq_rot90_f32(a, b) simde_vcaddq_rot90_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) + return vcaddq_rot90_f64(a, b); + #else + simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + uint64_t idx1[2] = {1, 2}; + vfloat64m2_t b_tmp = __riscv_vlmul_ext_v_f64m1_f64m2 (b_.sv128); + vfloat64m1_t op1 = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vrgather_vv_f64m2(__riscv_vslideup_vx_f64m2( \ + __riscv_vfneg_v_f64m2(b_tmp, 2), b_tmp, 2, 4), __riscv_vle64_v_u64m2(idx1, 2), 2)); + r_.sv128 = __riscv_vfadd_vv_f64m1(op1, a_.sv128, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i]; + r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1]; + } + #endif + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcaddq_rot90_f64 + #define vcaddq_rot90_f64(a, b) simde_vcaddq_rot90_f64(a, b) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT90_H) */ diff --git a/arm/neon/cage.h b/arm/neon/cage.h index 5d47b8aa6..0d71025c5 100644 --- a/arm/neon/cage.h +++ b/arm/neon/cage.h @@ -47,7 +47,8 @@ simde_vcageh_f16(simde_float16_t a, simde_float16_t b) { return (simde_math_fabsf(a_) >= simde_math_fabsf(b_)) ? 
UINT16_MAX : UINT16_C(0); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcageh_f16 #define vcageh_f16(a, b) simde_vcageh_f16((a), (b)) #endif @@ -99,7 +100,8 @@ simde_vcage_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcage_f16 #define vcage_f16(a, b) simde_vcage_f16((a), (b)) #endif @@ -150,7 +152,8 @@ simde_vcageq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcageq_f16 #define vcageq_f16(a, b) simde_vcageq_f16((a), (b)) #endif diff --git a/arm/neon/cagt.h b/arm/neon/cagt.h index 138512f88..bbe9db2c2 100644 --- a/arm/neon/cagt.h +++ b/arm/neon/cagt.h @@ -48,7 +48,8 @@ simde_vcagth_f16(simde_float16_t a, simde_float16_t b) { return (simde_math_fabsf(af) > simde_math_fabsf(bf)) ? UINT16_MAX : UINT16_C(0); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcagth_f16 #define vcagth_f16(a, b) simde_vcagth_f16((a), (b)) #endif @@ -99,7 +100,8 @@ simde_vcagt_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcagt_f16 #define vcagt_f16(a, b) simde_vcagt_f16((a), (b)) #endif @@ -150,7 +152,8 @@ simde_vcagtq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcagtq_f16 #define vcagtq_f16(a, b) simde_vcagtq_f16((a), (b)) #endif diff --git a/arm/neon/cale.h b/arm/neon/cale.h new file mode 100644 index 000000000..acf795c02 --- /dev/null +++ b/arm/neon/cale.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CALE_H) +#define SIMDE_ARM_NEON_CALE_H + +#include "cage.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcaleh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcaleh_f16(a, b); + #else + return simde_vcageh_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcaleh_f16 + #define vcaleh_f16(a, b) simde_vcaleh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcales_f32(simde_float32_t a, simde_float32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcales_f32(a, b); + #else + return simde_vcages_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcales_f32 + #define vcales_f32(a, b) simde_vcales_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcaled_f64(simde_float64_t a, simde_float64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaled_f64(a, b); + #else + return simde_vcaged_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaled_f64 + #define vcaled_f64(a, b) simde_vcaled_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcale_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcale_f16(a, b); + #else + return simde_vcage_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcale_f16 + #define vcale_f16(a, b) simde_vcale_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcale_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcale_f32(a, b); + #else + return simde_vcage_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcale_f32 + #define vcale_f32(a, b) simde_vcale_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcale_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcale_f64(a, b); + #else + return simde_vcage_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcale_f64 + #define vcale_f64(a, b) simde_vcale_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcaleq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcaleq_f16(a, b); + #else + return simde_vcageq_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcaleq_f16 + #define vcaleq_f16(a, b) simde_vcaleq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcaleq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcaleq_f32(a, b); + #else + return simde_vcageq_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcaleq_f32 + #define vcaleq_f32(a, b) simde_vcaleq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcaleq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaleq_f64(a, b); + #else + return simde_vcageq_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaleq_f64 + #define vcaleq_f64(a, b) simde_vcaleq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_cale_H) */ diff --git a/arm/neon/calt.h b/arm/neon/calt.h new file mode 100644 index 000000000..1ee960357 --- /dev/null +++ b/arm/neon/calt.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CALT_H) +#define SIMDE_ARM_NEON_CALT_H + +#include "cagt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcalth_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcalth_f16(a, b); + #else + return simde_vcagth_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcalth_f16 + #define vcalth_f16(a, b) simde_vcalth_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcalts_f32(simde_float32_t a, simde_float32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcalts_f32(a, b); + #else + return simde_vcagts_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcalts_f32 + #define vcalts_f32(a, b) simde_vcalts_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcaltd_f64(simde_float64_t a, simde_float64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaltd_f64(a, b); + #else + return simde_vcagtd_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaltd_f64 + #define vcaltd_f64(a, b) simde_vcaltd_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcalt_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcalt_f16(a, b); + #else + return simde_vcagt_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcalt_f16 + #define vcalt_f16(a, b) simde_vcalt_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcalt_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcalt_f32(a, b); + #else + return simde_vcagt_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcalt_f32 + #define vcalt_f32(a, b) simde_vcalt_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcalt_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcalt_f64(a, b); + #else + return simde_vcagt_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcalt_f64 + #define vcalt_f64(a, b) simde_vcalt_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcaltq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcaltq_f16(a, b); + #else + return simde_vcagtq_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcaltq_f16 + #define vcaltq_f16(a, b) simde_vcaltq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcaltq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcaltq_f32(a, b); + #else + return simde_vcagtq_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcaltq_f32 + #define vcaltq_f32(a, b) simde_vcaltq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_uint64x2_t +simde_vcaltq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaltq_f64(a, b); + #else + return simde_vcagtq_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaltq_f64 + #define vcaltq_f64(a, b) simde_vcaltq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CAGT_H) */ diff --git a/arm/neon/ceq.h b/arm/neon/ceq.h index e60a4bf79..829ef60cb 100644 --- a/arm/neon/ceq.h +++ b/arm/neon/ceq.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CEQ_H) @@ -42,7 +43,8 @@ simde_vceqh_f16(simde_float16_t a, simde_float16_t b) { return (simde_float16_to_float32(a) == simde_float16_to_float32(b)) ? UINT16_MAX : UINT16_C(0); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqh_f16 #define vceqh_f16(a, b) simde_vceqh_f16((a), (b)) #endif @@ -121,7 +123,8 @@ simde_vceq_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceq_f16 #define vceq_f16(a, b) simde_vceq_f16((a), (b)) #endif @@ -431,7 +434,8 @@ simde_vceqq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqq_f16 #define vceqq_f16(a, b) simde_vceqq_f16((a), (b)) #endif @@ -766,6 +770,102 @@ simde_vceqq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vceqq_u64(a, b) simde_vceqq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vceq_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vceq_p8(a, b); + #else + simde_uint8x8_private r_; + simde_poly8x8_private + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? HEDLEY_STATIC_CAST(uint8_t, ~UINT8_C(0)) : HEDLEY_STATIC_CAST(uint8_t, UINT8_C(0)); + } + + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vceq_p8 + #define vceq_p8(a, b) simde_vceq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vceqq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vceqq_p8(a, b); + #else + simde_uint8x16_private r_; + simde_poly8x16_private + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? 
HEDLEY_STATIC_CAST(uint8_t, ~UINT8_C(0)) : HEDLEY_STATIC_CAST(uint8_t, UINT8_C(0)); + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vceqq_p8 + #define vceqq_p8(a, b) simde_vceqq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vceq_p64(simde_poly64x1_t a, simde_poly64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vceq_p64(a, b); + #else + simde_uint64x1_private r_; + simde_poly64x1_private + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vceq_p64 + #define vceq_p64(a, b) simde_vceq_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vceqq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vceqq_p64(a, b); + #else + simde_uint64x2_private r_; + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vceqq_p64 + #define vceqq_p64(a, b) simde_vceqq_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/ceqz.h b/arm/neon/ceqz.h index 176ecce0f..47d2ecaf7 100644 --- a/arm/neon/ceqz.h +++ b/arm/neon/ceqz.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CEQZ_H) @@ -46,7 +47,8 @@ simde_vceqz_f16(simde_float16x4_t a) { return simde_vceq_f16(a, simde_vdup_n_f16(SIMDE_FLOAT16_VALUE(0.0))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqz_f16 #define vceqz_f16(a) simde_vceqz_f16((a)) #endif @@ -200,7 +202,8 @@ simde_vceqzq_f16(simde_float16x8_t a) { return simde_vceqq_f16(a, simde_vdupq_n_f16(SIMDE_FLOAT16_VALUE(0.0))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqzq_f16 #define vceqzq_f16(a) simde_vceqzq_f16((a)) #endif @@ -375,14 +378,15 @@ simde_vceqzd_u64(uint64_t a) { SIMDE_FUNCTION_ATTRIBUTES uint16_t -simde_vceqzh_f16(simde_float16 a) { +simde_vceqzh_f16(simde_float16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vceqzh_f16(a); #else return simde_vceqh_f16(a, SIMDE_FLOAT16_VALUE(0.0)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqzh_f16 #define vceqzh_f16(a) simde_vceqzh_f16((a)) #endif @@ -415,6 +419,62 @@ simde_vceqzd_f64(simde_float64_t a) { #define vceqzd_f64(a) simde_vceqzd_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t 
+simde_vceqz_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqz_p8(a); + #else + return simde_vceq_p8(a, simde_vdup_n_p8(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqz_p8 + #define vceqz_p8(a) simde_vceqz_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vceqzq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqzq_p8(a); + #else + return simde_vceqq_p8(a, simde_vdupq_n_p8(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqzq_p8 + #define vceqzq_p8(a) simde_vceqzq_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vceqz_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqz_p64(a); + #else + return simde_vceq_p64(a, simde_vdup_n_p64(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqz_p64 + #define vceqz_p64(a) simde_vceqz_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vceqzq_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqzq_p64(a); + #else + return simde_vceqq_p64(a, simde_vdupq_n_p64(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqzq_p64 + #define vceqzq_p64(a) simde_vceqzq_p64((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/cge.h b/arm/neon/cge.h index 2ed6655a4..47759c395 100644 --- a/arm/neon/cge.h +++ b/arm/neon/cge.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CGE_H) @@ -43,7 +44,8 @@ simde_vcgeh_f16(simde_float16_t a, simde_float16_t b){ return (simde_float16_to_float32(a) >= simde_float16_to_float32(b)) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgeh_f16 #define vcgeh_f16(a, b) simde_vcgeh_f16((a), (b)) #endif @@ -59,15 +61,22 @@ simde_vcgeq_f16(simde_float16x8_t a, simde_float16x8_t b) { b_ = simde_float16x8_to_private(b); simde_uint16x8_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfge_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgeq_f16 #define vcgeq_f16(a, b) simde_vcgeq_f16((a), (b)) #endif @@ -85,10 +94,15 @@ simde_vcgeq_f32(simde_float32x4_t a, simde_float32x4_t b) { b_ = simde_float32x4_to_private(b); simde_uint32x4_private r_; + #if defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_castps_si128(_mm_cmpge_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfge_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -123,6 +137,10 @@ simde_vcgeq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmpge_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfge_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -157,6 +175,10 @@ simde_vcgeq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi8(a_.m128i, b_.m128i), _mm_cmpeq_epi8(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsge_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -191,6 +213,10 @@ simde_vcgeq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi16(a_.m128i, b_.m128i), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsge_vv_i16m1_b16(a_.sv128, 
b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -225,6 +251,10 @@ simde_vcgeq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(a_.m128i, b_.m128i), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsge_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -259,6 +289,10 @@ simde_vcgeq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(a_.m128i, b_.m128i), _mm_cmpeq_epi64(a_.m128i, b_.m128i)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsge_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -297,6 +331,10 @@ simde_vcgeq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgeu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -338,6 +376,10 @@ simde_vcgeq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi16(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgeu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -379,6 +421,10 @@ simde_vcgeq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgeu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -418,6 +464,10 @@ simde_vcgeq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #elif defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bits = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), 
_mm_cmpeq_epi64(a_.m128i, b_.m128i)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgeu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -446,15 +496,22 @@ simde_vcge_f16(simde_float16x4_t a, simde_float16x4_t b) { b_ = simde_float16x4_to_private(b); simde_uint16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfge_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, 0xffff, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcge_f16 #define vcge_f16(a, b) simde_vcge_f16((a), (b)) #endif @@ -470,7 +527,11 @@ simde_vcge_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfge_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE @@ -498,7 +559,11 @@ simde_vcge_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfge_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE @@ -528,6 +593,10 @@ simde_vcge_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(a_.m64, b_.m64), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsge_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -560,6 +629,10 @@ simde_vcge_s16(simde_int16x4_t a, simde_int16x4_t b) { r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(a_.m64, b_.m64), _mm_cmpeq_pi16(a_.m64, b_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsge_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -590,6 +663,10 @@ simde_vcge_s32(simde_int32x2_t a, simde_int32x2_t b) { r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(a_.m64, b_.m64), _mm_cmpeq_pi32(a_.m64, b_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsge_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -616,7 +693,11 @@ simde_vcge_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsge_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE @@ -647,6 +728,10 @@ simde_vcge_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(_mm_xor_si64(a_.m64, sign_bits), _mm_xor_si64(b_.m64, sign_bits)), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgeu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -678,6 +763,10 @@ simde_vcge_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(_mm_xor_si64(a_.m64, sign_bits), _mm_xor_si64(b_.m64, sign_bits)), _mm_cmpeq_pi16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgeu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -709,6 +798,10 @@ simde_vcge_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(_mm_xor_si64(a_.m64, sign_bits), _mm_xor_si64(b_.m64, sign_bits)), _mm_cmpeq_pi32(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgeu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -737,7 +830,11 @@ simde_vcge_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ =
simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgeu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE diff --git a/arm/neon/cgez.h b/arm/neon/cgez.h index b84408361..5bf373302 100644 --- a/arm/neon/cgez.h +++ b/arm/neon/cgez.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CGEZ_H) @@ -78,6 +79,44 @@ simde_vcgezs_f32(simde_float32_t a) { #define vcgezs_f32(a) simde_vcgezs_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcgezh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcgezh_f16(a)); + #else + return (simde_float16_to_float32(a) >= SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgezh_f16 + #define vcgezh_f16(a) simde_vcgezh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcgezq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgezq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgezh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgezq_f16 + #define vcgezq_f16(a) simde_vcgezq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcgezq_f32(simde_float32x4_t a) { @@ -246,6 +285,29 @@ simde_vcgezq_s64(simde_int64x2_t a) { #define vcgezq_s64(a) simde_vcgezq_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcgez_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgez_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgezh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgez_f16 + #define vcgez_f16(a) simde_vcgez_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcgez_f32(simde_float32x2_t a) { diff --git a/arm/neon/cgt.h b/arm/neon/cgt.h index a090dca5b..b48bf70cb 100644 --- a/arm/neon/cgt.h +++ b/arm/neon/cgt.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CGT_H) @@ -78,6 +80,24 @@ simde_vcgtd_u64(uint64_t a, uint64_t b) { #define vcgtd_u64(a, b) simde_vcgtd_u64((a), (b)) #endif 
+SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcgth_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcgth_f16(a, b)); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + + return (a_ > b_) ? UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgth_f16 + #define vcgth_f16(a, b) simde_vcgth_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vcgts_f32(simde_float32_t a, simde_float32_t b) { @@ -92,6 +112,37 @@ simde_vcgts_f32(simde_float32_t a, simde_float32_t b) { #define vcgts_f32(a, b) simde_vcgts_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgtq_f16(a, b); + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_uint16x8_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfgt_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); + } + #endif + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgtq_f16 + #define vcgtq_f16(a, b) simde_vcgtq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -109,6 +160,10 @@ simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128i = _mm_castps_si128(_mm_cmpgt_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfgt_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -143,6 +198,10 @@ simde_vcgtq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmpgt_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfgt_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -177,6 +236,10 @@ simde_vcgtq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_cmpgt_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgt_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); 
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -211,6 +274,10 @@ simde_vcgtq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_cmpgt_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgt_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -245,6 +312,10 @@ simde_vcgtq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_cmpgt_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgt_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -284,6 +355,10 @@ simde_vcgtq_s64(simde_int64x2_t a, simde_int64x2_t b) { __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a_.m128i, b_.m128i), _mm_sub_epi64(b_.m128i, a_.m128i)); r = _mm_or_si128(r, _mm_cmpgt_epi32(a_.m128i, b_.m128i)); r_.m128i = _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgt_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -319,6 +394,10 @@ simde_vcgtq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_adds_epu8(tmp, _mm_sub_epi8(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgtu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -354,6 +433,10 @@ simde_vcgtq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_adds_epu16(tmp, _mm_sub_epi16(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgtu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -392,6 +475,10 @@ simde_vcgtq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgtu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); 
#else @@ -425,6 +512,10 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bit = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_cmpgt_epi64(_mm_xor_si128(a_.m128i, sign_bit), _mm_xor_si128(b_.m128i, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgtu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -442,6 +533,37 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vcgtq_u64(a, b) simde_vcgtq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgt_f16(a, b); + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_uint16x4_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfgt_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); + } + #endif + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgt_f16 + #define vcgt_f16(a, b) simde_vcgt_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -453,7 +575,11 @@ simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfgt_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE @@ -481,7 +607,11 @@ simde_vcgt_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfgt_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE @@ -511,6 +641,10 @@ simde_vcgt_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi8(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgt_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > 
b_.values); #else @@ -541,6 +675,10 @@ simde_vcgt_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi16(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgt_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -571,6 +709,10 @@ simde_vcgt_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi32(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgt_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -599,7 +741,11 @@ simde_vcgt_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgt_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE @@ -630,6 +776,10 @@ simde_vcgt_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bit = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_cmpgt_pi8(_mm_xor_si64(a_.m64, sign_bit), _mm_xor_si64(b_.m64, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgtu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -661,6 +811,10 @@ simde_vcgt_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bit = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_cmpgt_pi16(_mm_xor_si64(a_.m64, sign_bit), _mm_xor_si64(b_.m64, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgtu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -692,6 +846,10 @@ simde_vcgt_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bit = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_cmpgt_pi32(_mm_xor_si64(a_.m64, sign_bit), _mm_xor_si64(b_.m64, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgtu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -720,7 +878,11 @@ simde_vcgt_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = 
simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgtu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE diff --git a/arm/neon/cgtz.h b/arm/neon/cgtz.h index 125e009b2..55ed0b7eb 100644 --- a/arm/neon/cgtz.h +++ b/arm/neon/cgtz.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CGTZ_H) @@ -66,6 +67,44 @@ simde_vcgtzd_f64(simde_float64_t a) { #define vcgtzd_f64(a) simde_vcgtzd_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcgtzh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcgtzh_f16(a)); + #else + return (simde_float16_to_float32(a) > SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgtzh_f16 + #define vcgtzh_f16(a) simde_vcgtzh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcgtzq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgtzq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgtzh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgtzq_f16 + #define vcgtzq_f16(a) simde_vcgtzq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vcgtzs_f32(simde_float32_t a) { @@ -248,6 +287,29 @@ simde_vcgtzq_s64(simde_int64x2_t a) { #define vcgtzq_s64(a) simde_vcgtzq_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcgtz_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgtz_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgtzh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcgtz_f16 + #define vcgtz_f16(a) simde_vcgtz_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcgtz_f32(simde_float32x2_t a) { diff --git a/arm/neon/cle.h b/arm/neon/cle.h index 5a1591b30..6ca45d5f4 100644 --- a/arm/neon/cle.h +++ b/arm/neon/cle.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CLE_H) @@ -90,6 +92,52 @@ simde_vcles_f32(simde_float32_t a, simde_float32_t b) { #define vcles_f32(a, b) simde_vcles_f32((a), (b)) #endif 
+SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcleh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcleh_f16(a, b)); + #else + return (simde_float16_to_float32(a) <= simde_float16_to_float32(b)) ? UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcleh_f16 + #define vcleh_f16(a, b) simde_vcleh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcleq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcleq_f16(a, b); + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_uint16x8_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); + } + #endif + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcleq_f16 + #define vcleq_f16(a, b) simde_vcleq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcleq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -107,6 +155,10 @@ simde_vcleq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128i = _mm_castps_si128(_mm_cmple_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -141,6 +193,10 @@ simde_vcleq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmple_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -175,6 +231,10 @@ simde_vcleq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi8(b_.m128i, a_.m128i), _mm_cmpeq_epi8(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -209,6 +269,10 @@ simde_vcleq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = 
_mm_or_si128(_mm_cmpgt_epi16(b_.m128i, a_.m128i), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -243,6 +307,10 @@ simde_vcleq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(b_.m128i, a_.m128i), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -277,6 +345,10 @@ simde_vcleq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(b_.m128i, a_.m128i), _mm_cmpeq_epi64(a_.m128i, b_.m128i)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -316,6 +388,10 @@ simde_vcleq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsleu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -364,6 +440,10 @@ simde_vcleq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsleu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -412,6 +492,10 @@ simde_vcleq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsleu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -458,6 +542,10 @@ simde_vcleq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { ), _mm_cmpeq_epi64(a_.m128i, b_.m128i) ); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsleu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, 
-1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -475,6 +563,37 @@ simde_vcleq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vcleq_u64(a, b) simde_vcleq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcle_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcle_f16(a, b); + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_uint16x4_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); + } + #endif + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcle_f16 + #define vcle_f16(a, b) simde_vcle_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcle_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -486,7 +605,11 @@ simde_vcle_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE @@ -514,7 +637,11 @@ simde_vcle_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE @@ -544,6 +671,10 @@ simde_vcle_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(b_.m64, a_.m64), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -574,6 +705,10 @@ simde_vcle_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(b_.m64, a_.m64), _mm_cmpeq_pi16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); 
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -604,6 +739,10 @@ simde_vcle_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(b_.m64, a_.m64), _mm_cmpeq_pi32(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -632,7 +771,11 @@ simde_vcle_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE @@ -663,6 +806,10 @@ simde_vcle_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsleu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -694,6 +841,10 @@ simde_vcle_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)), _mm_cmpeq_pi16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsleu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -725,6 +876,10 @@ simde_vcle_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)), _mm_cmpeq_pi32(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsleu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -753,7 +908,11 @@ simde_vcle_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = 
__riscv_vmsleu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE diff --git a/arm/neon/clez.h b/arm/neon/clez.h index ae3eea9b8..9c30a9f33 100644 --- a/arm/neon/clez.h +++ b/arm/neon/clez.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CLEZ_H) @@ -78,18 +80,68 @@ simde_vclezs_f32(simde_float32_t a) { #define vclezs_f32(a) simde_vclezs_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vclezh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vclezh_f16(a)); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + + return (a_ <= 0.0f) ? UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vclezh_f16 + #define vclezh_f16(a) simde_vclezh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vclezq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vclezq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vf_f16m1_b16(a_.sv128, 0, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclezh_f16(a_.values[i]); + } + #endif + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vclezq_f16 + #define vclezq_f16(a) simde_vclezq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vclezq_f32(simde_float32x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_f32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_f32(a, simde_vdupq_n_f32(SIMDE_FLOAT32_C(0.0))); #else simde_float32x4_private a_ = simde_float32x4_to_private(a); simde_uint32x4_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vf_f32m1_b32(a_.sv128, 0, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT32_C(0.0)); #else SIMDE_VECTORIZE @@ -111,13 +163,17 @@ simde_uint64x2_t simde_vclezq_f64(simde_float64x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_f64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_f64(a, simde_vdupq_n_f64(SIMDE_FLOAT64_C(0.0))); #else simde_float64x2_private a_ = simde_float64x2_to_private(a); simde_uint64x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if 
defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vf_f64m1_b64(a_.sv128, 0, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT64_C(0.0)); #else SIMDE_VECTORIZE @@ -139,13 +195,17 @@ simde_uint8x16_t simde_vclezq_s8(simde_int8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s8(a, simde_vdupq_n_s8(0)); #else simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_uint8x16_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vx_i8m1_b8(a_.sv128, 0, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -167,13 +227,17 @@ simde_uint16x8_t simde_vclezq_s16(simde_int16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s16(a, simde_vdupq_n_s16(0)); #else simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_uint16x8_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vx_i16m1_b16(a_.sv128, 0, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -195,13 +259,17 @@ simde_uint32x4_t simde_vclezq_s32(simde_int32x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s32(a, simde_vdupq_n_s32(0)); #else simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_uint32x4_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vx_i32m1_b32(a_.sv128, 0, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -223,13 +291,17 @@ simde_uint64x2_t simde_vclezq_s64(simde_int64x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s64(a, simde_vdupq_n_s64(0)); #else simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_uint64x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vx_i64m1_b64(a_.sv128, 0, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -246,18 +318,51 @@ simde_vclezq_s64(simde_int64x2_t 
a) { #define vclezq_s64(a) simde_vclezq_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vclez_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vclez_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vf_f16m1_b16(a_.sv64, 0, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclezh_f16(a_.values[i]); + } + #endif + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vclez_f16 + #define vclez_f16(a) simde_vclez_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vclez_f32(simde_float32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_f32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_f32(a, simde_vdup_n_f32(SIMDE_FLOAT32_C(0.0))); #else simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vf_f32m1_b32(a_.sv64, 0, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT32_C(0.0)); #else SIMDE_VECTORIZE @@ -279,13 +384,17 @@ simde_uint64x1_t simde_vclez_f64(simde_float64x1_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_f64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_f64(a, simde_vdup_n_f64(SIMDE_FLOAT64_C(0.0))); #else simde_float64x1_private a_ = simde_float64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vf_f64m1_b64(a_.sv64, 0, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT64_C(0.0)); #else SIMDE_VECTORIZE @@ -307,13 +416,17 @@ simde_uint8x8_t simde_vclez_s8(simde_int8x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s8(a, simde_vdup_n_s8(0)); #else simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint8x8_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vx_i8m1_b8(a_.sv64, 0, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ 
-335,13 +448,17 @@ simde_uint16x4_t simde_vclez_s16(simde_int16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s16(a, simde_vdup_n_s16(0)); #else simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_uint16x4_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vx_i16m1_b16(a_.sv64, 0, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -363,13 +480,17 @@ simde_uint32x2_t simde_vclez_s32(simde_int32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s32(a, simde_vdup_n_s32(0)); #else simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vx_i32m1_b32(a_.sv64, 0, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -391,13 +512,17 @@ simde_uint64x1_t simde_vclez_s64(simde_int64x1_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s64(a, simde_vdup_n_s64(0)); #else simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vx_i64m1_b64(a_.sv64, 0, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE diff --git a/arm/neon/clt.h b/arm/neon/clt.h index ae3602732..7a5f28161 100644 --- a/arm/neon/clt.h +++ b/arm/neon/clt.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CLT_H) @@ -77,6 +79,24 @@ simde_vcltd_u64(uint64_t a, uint64_t b) { #define vcltd_u64(a, b) simde_vcltd_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vclth_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vclth_f16(a, b)); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + + return (a_ < b_) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vclth_f16 + #define vclth_f16(a, b) simde_vclth_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vclts_f32(simde_float32_t a, simde_float32_t b) { @@ -91,6 +111,37 @@ simde_vclts_f32(simde_float32_t a, simde_float32_t b) { #define vclts_f32(a, b) simde_vclts_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcltq_f16(a, b); + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_uint16x8_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmflt_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); + } + #endif + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcltq_f16 + #define vcltq_f16(a, b) simde_vcltq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -108,6 +159,10 @@ simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128i = _mm_castps_si128(_mm_cmplt_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmflt_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -142,6 +197,10 @@ simde_vcltq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmplt_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmflt_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -176,6 +235,10 @@ simde_vcltq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_cmplt_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmslt_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -210,6 +273,10 @@ simde_vcltq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_cmplt_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_lt(a_.v128, b_.v128); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmslt_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -244,6 +311,10 @@ simde_vcltq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_cmplt_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmslt_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -278,6 +349,10 @@ simde_vcltq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_cmpgt_epi64(b_.m128i, a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmslt_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -315,6 +390,10 @@ simde_vcltq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsltu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -355,6 +434,10 @@ simde_vcltq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_cmplt_epi16(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsltu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -395,6 +478,10 @@ simde_vcltq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_cmplt_epi32(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsltu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -433,6 +520,10 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #elif defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bits = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_cmpgt_epi64(_mm_xor_si128(b_.m128i, sign_bits), _mm_xor_si128(a_.m128i, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsltu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 
2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -450,6 +541,37 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vcltq_u64(a, b) simde_vcltq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vclt_f16(a, b); + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_uint16x4_private r_; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmflt_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); + } + #endif + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vclt_f16 + #define vclt_f16(a, b) simde_vclt_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -461,7 +583,11 @@ simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmflt_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE @@ -489,7 +615,11 @@ simde_vclt_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmflt_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE @@ -519,6 +649,10 @@ simde_vclt_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi8(b_.m64, a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmslt_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -549,6 +683,10 @@ simde_vclt_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi16(b_.m64, a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmslt_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) 
&& !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -579,6 +717,10 @@ simde_vclt_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi32(b_.m64, a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmslt_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -607,7 +749,11 @@ simde_vclt_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmslt_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE @@ -638,6 +784,10 @@ simde_vclt_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_cmpgt_pi8(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsltu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -669,6 +819,10 @@ simde_vclt_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_cmpgt_pi16(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsltu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -700,6 +854,10 @@ simde_vclt_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_cmpgt_pi32(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsltu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -728,7 +886,11 @@ simde_vclt_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsltu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), 
a_.values < b_.values); #else SIMDE_VECTORIZE diff --git a/arm/neon/cltz.h b/arm/neon/cltz.h index a9c94984e..a4d7f54f9 100644 --- a/arm/neon/cltz.h +++ b/arm/neon/cltz.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* TODO: float fallbacks should use vclt(a, vdup_n(0.0)) */ @@ -81,6 +82,44 @@ simde_vcltzs_f32(simde_float32_t a) { #define vcltzs_f32(a) simde_vcltzs_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcltzh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcltzh_f16(a)); + #else + return (simde_float16_to_float32(a) < SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcltzh_f16 + #define vcltzh_f16(a) simde_vcltzh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcltz_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcltz_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcltzh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcltz_f16 + #define vcltz_f16(a) simde_vcltz_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcltz_f32(simde_float32x2_t a) { @@ -201,6 +240,29 @@ simde_vcltz_s64(simde_int64x1_t a) { #define vcltz_s64(a) simde_vcltz_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcltzq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcltzq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcltzh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcltzq_f16 + #define vcltzq_f16(a) simde_vcltzq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcltzq_f32(simde_float32x4_t a) { diff --git a/arm/neon/cmla.h b/arm/neon/cmla.h index 559e60703..558475533 100644 --- a/arm/neon/cmla.h +++ b/arm/neon/cmla.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_H) @@ -33,12 +34,50 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = 
simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0]) / 2) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) + #undef vcmla_f16 + #define vcmla_f16(r, a, b) simde_vcmla_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmla_f32(r, a, b); #else simde_float32x2_private @@ -59,17 +98,58 @@ simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_f32 #define vcmla_f32(r, a, b) simde_vcmla_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0]) / 2) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) + 
#undef vcmlaq_f16 + #define vcmlaq_f16(r, a, b) simde_vcmlaq_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_f32(r, a, b); #else simde_float32x4_private @@ -77,7 +157,9 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, wasm_i32x4_shuffle(a_.v128, a_.v128, 0, 0, 2, 2))); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); r_.values += b_.values * a_.values; #else @@ -90,7 +172,10 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_f32 #define vcmlaq_f32(r, a, b) simde_vcmlaq_f32(r, a, b) #endif @@ -98,9 +183,10 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_f64(r, a, b); #else simde_float64x2_private @@ -108,7 +194,9 @@ simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, wasm_i64x2_shuffle(a_.v128, a_.v128, 0, 0))); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 0); r_.values += b_.values * a_.values; #else @@ -121,7 +209,10 @@ simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_f64 #define vcmlaq_f64(r, a, b) simde_vcmlaq_f64(r, a, b) #endif diff --git a/arm/neon/cmla_lane.h b/arm/neon/cmla_lane.h new file mode 100644 index 
000000000..0415641e3 --- /dev/null +++ b/arm/neon/cmla_lane.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_LANE_H) +#define SIMDE_ARM_NEON_CMLA_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +#include "cmla.h" +#include "get_lane.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, b_.sv64, op1, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_lane_f16(r, a, 
b, lane) vcmla_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_lane_f16(r, a, b, lane) simde_vcmla_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_lane_f16 + #define vcmla_lane_f16(r, a, b, lane) simde_vcmla_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[2] = {0, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, b_.sv64, op1, 2); + return simde_float32x2_from_private(r_); + #else + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_f32(r, a, b_tmp); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_lane_f32(r, a, b, lane) vcmla_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_lane_f32(r, a, b, lane) simde_vcmla_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_lane_f32 + #define vcmla_lane_f32(r, a, b, lane) simde_vcmla_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ + __riscv_vslideup_vx_f16m2(a_tmp, 
a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, b_.sv128, op1, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_lane_f16(r, a, b, lane) vcmlaq_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq0_lane_f16(r, a, b, lane) simde_vcmlaq_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_lane_f16 + #define vcmlaq_lane_f16(r, a, b, lane) simde_vcmlaq_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ + __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, b_.sv128, op1, 4); + return simde_float32x4_from_private(r_); + #else + simde_float32x4_t b_tmp = simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = 
simde_float32x4_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_f32(r, a, b_tmp); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_lane_f32(r, a, b, lane) vcmlaq_lane_f32(r, a, b, 0) +#else + #define simde_vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_lane_f32 + #define vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, b_.sv64, op1, 4); + return simde_float16x4_from_private(r_); + #else + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_laneq_f16(r, a, b, lane) vcmla_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_laneq_f16 + #define vcmla_laneq_f16(r, a, b, lane) simde_vcmla_laneq_f16(r, a, b, lane) +#endif + 
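Editor's aside (illustrative only, not part of the patch): every vcmla*_lane* variant in this new header reduces to the same arithmetic. The complex element selected by lane in b is broadcast -- that is what the u32/u64 reinterpret-and-dup trick does -- and the result is fed into the plain FCMLA rotate-0 accumulation seen in the scalar fallbacks of cmla.h. The RVV vrgather/vfmacc paths, the SIMDE_SHUFFLE_VECTOR_ paths, and the WASM shuffles are vectorized forms of the loop sketched below, which assumes float32 storage with interleaved real/imaginary parts; the function name and signature are hypothetical and do not exist in SIMDe.

/* Minimal scalar model of the vcmla*_lane* (rotate 0) computation.
 * Illustrative sketch only; not part of SIMDe or of this patch. */
#include <stddef.h>

static void
cmla_lane_reference(float r[], const float a[], const float b[],
                    size_t n_complex, size_t lane) {
  /* Broadcast the complex pair b[lane], mirroring the vdup_lane on the
   * u32/u64 reinterpretation of b used by the fallback macros above. */
  const float b_re = b[2 * lane];
  const float b_im = b[2 * lane + 1];
  for (size_t i = 0 ; i < n_complex ; i++) {
    /* FCMLA rotate 0: only the real part of a participates. */
    r[2 * i]     += a[2 * i] * b_re;
    r[2 * i + 1] += a[2 * i] * b_im;
  }
}

The float16 variants follow the same pattern, widening each half-precision element to float32 before the multiply-accumulate and narrowing the result back, exactly as the simde_float16_to_float32/simde_float16_from_float32 fallbacks in cmla.h do.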
+SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {0, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, b_.sv64, op1, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_laneq_f32(r, a, b, lane) vcmla_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_laneq_f32 + #define vcmla_laneq_f32(r, a, b, lane) simde_vcmla_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ + __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, b_.sv128, op1, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + 
simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_laneq_f16(r, a, b, lane) vcmlaq_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_laneq_f16 + #define vcmlaq_laneq_f16(r, a, b, lane) simde_vcmlaq_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ + __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, b_.sv128, op1, 4); + return simde_float32x4_from_private(r_); + #else + simde_float32x4_t b_tmp = simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_f32(r, a, b_tmp); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_laneq_f32(r, a, b, lane) vcmlaq_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_laneq_f32 + 
#define vcmlaq_laneq_f32(r, a, b, lane) simde_vcmlaq_laneq_f32(r, a, b, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_LANE_H) */ diff --git a/arm/neon/cmla_rot180.h b/arm/neon/cmla_rot180.h index 5a5fa3f85..bfa7ee840 100644 --- a/arm/neon/cmla_rot180.h +++ b/arm/neon/cmla_rot180.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT180_H) @@ -33,12 +34,49 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot180_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_rot180_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) + #undef vcmla_rot180_f16 + #define vcmla_rot180_f16(r, a, b) simde_vcmla_rot180_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) return vcmla_rot180_f32(r, a, b); #else simde_float32x2_private @@ -61,17 +99,57 @@ simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2 return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmla_rot180_f32 #define vcmla_rot180_f32(r, a, b) simde_vcmla_rot180_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + return vcmlaq_rot180_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_f16 + #define vcmlaq_rot180_f16(r, a, b) simde_vcmlaq_rot180_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot180_f32(r, a, b); #else simde_float32x4_private @@ -79,7 +157,11 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 0, 0, 2, 2); + b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), wasm_f32x4_neg(b_.v128), 0, 1, 2, 3); + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, -b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; @@ -94,7 +176,10 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmlaq_rot180_f32 #define vcmlaq_rot180_f32(r, a, b) simde_vcmlaq_rot180_f32(r, a, b) #endif @@ -102,9 +187,10 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot180_f64(r, a, b); #else simde_float64x2_private @@ -112,7 +198,11 @@ simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 0, 0); + b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), wasm_f64x2_neg(b_.v128), 0, 1); + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 0); b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, -b_.values, 0, 1); r_.values += b_.values * a_.values; @@ -127,7 +217,10 @@ simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot180_f64 #define vcmlaq_rot180_f64(r, a, b) simde_vcmlaq_rot180_f64(r, a, b) #endif diff --git a/arm/neon/cmla_rot180_lane.h b/arm/neon/cmla_rot180_lane.h new file mode 100644 index 000000000..19f6b7fbb --- /dev/null +++ b/arm/neon/cmla_rot180_lane.h @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H) +#define SIMDE_ARM_NEON_CMLA_ROT180_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +#include "cmla_rot180.h" +#include "get_lane.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + uint16_t idx2[4] = {0, 1, 2, 3}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot180_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_lane_f16(r, a, b, lane) vcmla_rot180_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_lane_f16 + #define vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot180_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + 
simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {0, 0}; + uint32_t idx2[2] = {0, 1}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_rot180_f32(r, a, b_tmp); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_lane_f32(r, a, b, lane) vcmla_rot180_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_lane_f32 + #define vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + uint16_t idx2[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == 
SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot180_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_lane_f16(r, a, b, lane) vcmlaq_rot180_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_lane_f16 + #define vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcmlaq_rot180_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + uint32_t idx2[4] = {0, 1, 2, 3}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = 
simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_rot180_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_lane_f32(r, a, b, lane) vcmlaq_rot180_lane_f32(r, a, b, lane) +#else + #define simde_vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_lane_f32 + #define vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + uint16_t idx2[4] = {0, 1, 2, 3}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot180_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_laneq_f16(r, a, b, lane) vcmla_rot180_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_laneq_f16 + #define vcmla_rot180_laneq_f16(r, a, b, lane) simde_vcmla_rot180_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot180_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {0, 0}; + uint32_t idx2[2] = {0, 1}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_rot180_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_laneq_f32(r, a, b, lane) vcmla_rot180_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_laneq_f32 + #define vcmla_rot180_laneq_f32(r, a, b, lane) simde_vcmla_rot180_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, + const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + 
simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + uint16_t idx2[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot180_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) vcmlaq_rot180_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_laneq_f16 + #define vcmlaq_rot180_laneq_f16(r, a, b, lane) simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcmlaq_rot180_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + 
uint32_t idx2[4] = {0, 1, 2, 3}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_rot180_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) vcmlaq_rot180_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_laneq_f32 + #define vcmlaq_rot180_laneq_f32(r, a, b, lane) simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H) */ diff --git a/arm/neon/cmla_rot270.h b/arm/neon/cmla_rot270.h index cb9835c1f..363f0b2ff 100644 --- a/arm/neon/cmla_rot270.h +++ b/arm/neon/cmla_rot270.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT270_H) @@ -33,10 +34,46 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot270_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + return vcmla_rot270_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = 
simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_f16 + #define vcmla_rot270_f16(r, a, b) simde_vcmla_rot270_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot270_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) return vcmla_rot270_f32(r, a, b); @@ -61,15 +98,54 @@ simde_vcmla_rot270_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2 return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmla_rot270_f32 #define vcmla_rot270_f32(r, a, b) simde_vcmla_rot270_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot270_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + return vcmlaq_rot270_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_f16 + #define vcmlaq_rot270_f16(r, a, b) simde_vcmlaq_rot270_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_COMPLEX) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) return vcmlaq_rot270_f32(r, a, b); @@ -79,7 +155,11 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 1, 1, 3, 3); + b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), b_.v128, 5, 0, 7, 2); + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; @@ -94,7 +174,10 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmlaq_rot270_f32 #define vcmlaq_rot270_f32(r, a, b) simde_vcmlaq_rot270_f32(r, a, b) #endif @@ -102,9 +185,10 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot270_f64(r, a, b); #else simde_float64x2_private @@ -112,7 +196,11 @@ simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 1, 1); + b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), b_.v128, 3, 0); + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0); r_.values += b_.values * a_.values; @@ -127,7 +215,10 @@ simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot270_f64 #define vcmlaq_rot270_f64(r, a, b) simde_vcmlaq_rot270_f64(r, a, b) #endif diff --git a/arm/neon/cmla_rot270_lane.h b/arm/neon/cmla_rot270_lane.h new file mode 100644 index 000000000..c11f00e4f --- 
/dev/null +++ b/arm/neon/cmla_rot270_lane.h @@ -0,0 +1,421 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H) +#define SIMDE_ARM_NEON_CMLA_ROT270_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +#include "cmla_rot270.h" +#include "get_lane.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {5, 0, 7, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot270_f16(r, a, 
simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_lane_f16(r, a, b, lane) vcmla_rot270_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_lane_f16 + #define vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot270_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_rot270_f32(r, a, b_tmp); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_lane_f32(r, a, b, lane) vcmla_rot270_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_lane_f32 + #define vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_float16x8_t +simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {9, 0, 11, 2, 13, 4, 15, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot270_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_lane_f16(r, a, b, lane) vcmlaq_rot270_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_lane_f16 + #define vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_lane_f16(r, a, b, 
lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {5, 0, 7, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_rot270_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_lane_f32(r, a, b, lane) vcmlaq_rot270_lane_f32(r, a, b, lane) +#else + #define simde_vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_lane_f32 + #define vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {5, 0, 7, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), 
__riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot270_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_laneq_f16(r, a, b, lane) vcmla_rot270_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_laneq_f16 + #define vcmla_rot270_laneq_f16(r, a, b, lane) simde_vcmla_rot270_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot270_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_rot270_f32(r, a, 
simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_laneq_f32(r, a, b, lane) vcmla_rot270_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_laneq_f32 + #define vcmla_rot270_laneq_f32(r, a, b, lane) simde_vcmla_rot270_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, + const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {9, 0, 11, 2, 13, 4, 15, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_high.values += b_.values * a_high.values; + r_low.values += b_.values * a_low.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot270_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) vcmlaq_rot270_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_laneq_f16 + #define vcmlaq_rot270_laneq_f16(r, a, b, lane) simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {5, 0, 7, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_rot270_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) vcmlaq_rot270_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_laneq_f32 + #define vcmlaq_rot270_laneq_f32(r, a, b, lane) simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H) */ diff --git 
a/arm/neon/cmla_rot90.h b/arm/neon/cmla_rot90.h index f4ebd13df..73cf3d34b 100644 --- a/arm/neon/cmla_rot90.h +++ b/arm/neon/cmla_rot90.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT90_H) @@ -33,12 +34,50 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot90_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_rot90_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) + #undef vcmla_rot90_f16 + #define vcmla_rot90_f16(r, a, b) simde_vcmla_rot90_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot90_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmla_rot90_f32(r, a, b); #else simde_float32x2_private @@ -61,17 +100,58 @@ simde_vcmla_rot90_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_ return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_rot90_f32 #define vcmla_rot90_f32(r, a, b) simde_vcmla_rot90_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot90_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_rot90_f16(r, a, b); + #else + 
simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) + #undef vcmlaq_rot90_f16 + #define vcmlaq_rot90_f16(r, a, b) simde_vcmlaq_rot90_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot90_f32(r, a, b); #else simde_float32x4_private @@ -79,7 +159,11 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4 a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 1, 1, 3, 3); + b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), b_.v128, 1, 4, 3, 6); + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; @@ -94,7 +178,10 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4 return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot90_f32 #define vcmlaq_rot90_f32(r, a, b) simde_vcmlaq_rot90_f32(r, a, b) #endif @@ -102,9 +189,10 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4 SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + 
defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot90_f64(r, a, b); #else simde_float64x2_private @@ -112,7 +200,11 @@ simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2 a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 1, 1); + b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), b_.v128, 1, 2); + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2); r_.values += b_.values * a_.values; @@ -127,7 +219,10 @@ simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2 return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot90_f64 #define vcmlaq_rot90_f64(r, a, b) simde_vcmlaq_rot90_f64(r, a, b) #endif diff --git a/arm/neon/cmla_rot90_lane.h b/arm/neon/cmla_rot90_lane.h new file mode 100644 index 000000000..bb530eaca --- /dev/null +++ b/arm/neon/cmla_rot90_lane.h @@ -0,0 +1,420 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H) +#define SIMDE_ARM_NEON_CMLA_ROT90_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +#include "cmla_rot90.h" +#include "get_lane.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot90_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_lane_f16(r, a, b, lane) vcmla_rot90_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_lane_f16 + #define vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot90_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 
0) +{ + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_rot90_f32(r, a, b_tmp); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_lane_f32(r, a, b, lane) vcmla_rot90_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_lane_f32 + #define vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == 
SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot90_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) vcmlaq_rot90_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_lane_f16 + #define vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcmlaq_rot90_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {1, 4, 3, 6}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = 
simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_rot90_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) vcmlaq_rot90_lane_f32(r, a, b, lane) +#else + #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_lane_f32 + #define vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot90_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_laneq_f16(r, a, b, lane) vcmla_rot90_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_laneq_f16 + #define vcmla_rot90_laneq_f16(r, a, b, lane) simde_vcmla_rot90_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot90_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); + #else + return simde_vcmla_rot90_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_laneq_f32(r, a, b, lane) vcmla_rot90_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_laneq_f32 + #define vcmla_rot90_laneq_f32(r, a, b, lane) simde_vcmla_rot90_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE > 128) + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + 
simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot90_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) vcmlaq_rot90_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_laneq_f16 + #define vcmlaq_rot90_laneq_f16(r, a, b, lane) simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcmlaq_rot90_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {1, 4, 3, 6}; + 
vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); + #else + return simde_vcmlaq_rot90_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) vcmlaq_rot90_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_laneq_f32 + #define vcmlaq_rot90_laneq_f32(r, a, b, lane) simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H) */ diff --git a/arm/neon/cnt.h b/arm/neon/cnt.h index e1fda38e7..faeaf51c4 100644 --- a/arm/neon/cnt.h +++ b/arm/neon/cnt.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CNT_H) @@ -54,10 +56,24 @@ simde_vcnt_s8(simde_int8x8_t a) { r_, a_ = simde_int8x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, simde_x_arm_neon_cntb(HEDLEY_STATIC_CAST(uint8_t, a_.values[i]))); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t p = __riscv_vreinterpret_v_i8m1_u8m1(a_.sv64); + vuint8m1_t tmp = __riscv_vand_vv_u8m1(__riscv_vsrl_vx_u8m1(p , 1 , 8) , __riscv_vmv_v_x_u8m1(0x55 , 8) , 8); + p = __riscv_vsub_vv_u8m1(p , tmp , 8); + tmp = p; + p = __riscv_vand_vv_u8m1(p , __riscv_vmv_v_x_u8m1(0x33 , 8) , 8); + tmp = __riscv_vand_vv_u8m1(__riscv_vsrl_vx_u8m1(tmp , 2 , 8) , __riscv_vmv_v_x_u8m1(0x33 , 8) , 8); + p = __riscv_vadd_vv_u8m1(p , tmp , 8); + tmp = __riscv_vsrl_vx_u8m1(p, 4 , 8); + p = __riscv_vadd_vv_u8m1(p , tmp , 8); + p = __riscv_vand_vv_u8m1(p , __riscv_vmv_v_x_u8m1(0xf , 8) , 8); + r_.sv64 = __riscv_vreinterpret_v_u8m1_i8m1(p); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, simde_x_arm_neon_cntb(HEDLEY_STATIC_CAST(uint8_t, a_.values[i]))); + } + #endif return simde_int8x8_from_private(r_); #endif @@ -139,6 +155,16 @@ simde_vcntq_s8(simde_int8x16_t a) { tmp = _mm_srli_epi16(a_.m128i, 4); a_.m128i = _mm_add_epi8(a_.m128i, tmp); r_.m128i = _mm_and_si128(a_.m128i, _mm_set1_epi8(0x0f)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t tmp = __riscv_vand_vv_i8m1(__riscv_vsra_vx_i8m1(a_.sv128 , 1 , 16) , __riscv_vmv_v_x_i8m1(0x55 , 16) , 16); + a_.sv128 = __riscv_vsub_vv_i8m1(a_.sv128 , tmp , 16); + tmp = a_.sv128; + a_.sv128 = __riscv_vand_vv_i8m1(a_.sv128 , __riscv_vmv_v_x_i8m1(0x33 , 16) , 16); + tmp = __riscv_vand_vv_i8m1(__riscv_vsra_vx_i8m1(tmp , 2 , 16) , __riscv_vmv_v_x_i8m1(0x33 , 16) , 16); + a_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128 , tmp , 16); + tmp = __riscv_vsra_vx_i8m1(a_.sv128, 4 , 16); + a_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128 , tmp , 16); + r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128 , __riscv_vmv_v_x_i8m1(0xf , 16) , 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -164,6 +190,34 @@ simde_vcntq_u8(simde_uint8x16_t a) { #define vcntq_u8(a) simde_vcntq_u8((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcnt_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcnt_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vcnt_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcnt_p8 + #define vcnt_p8(a) simde_vcnt_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcntq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcntq_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vcntq_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcntq_p8 + #define vcntq_p8(a) simde_vcntq_p8((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/combine.h b/arm/neon/combine.h index 66c1df646..d54662fa4 100644 --- a/arm/neon/combine.h +++ b/arm/neon/combine.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_COMBINE_H) @@ -34,6 +36,35 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcombine_f16(simde_float16x4_t low, simde_float16x4_t high) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcombine_f16(low, high); + #else + simde_float16x8_private r_; + simde_float16x4_private + low_ = simde_float16x4_to_private(low), + high_ = simde_float16x4_to_private(high); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vslideup_vx_f16m1(low_.sv64, high_.sv64, 4, 8); + #else + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcombine_f16 + #define vcombine_f16(low, high) 
simde_vcombine_f16((low), (high)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcombine_f32(simde_float32x2_t low, simde_float32x2_t high) { @@ -48,7 +79,9 @@ simde_vcombine_f32(simde_float32x2_t low, simde_float32x2_t high) { /* Note: __builtin_shufflevector can have a the output contain * twice the number of elements, __builtin_shuffle cannot. * Using SIMDE_SHUFFLE_VECTOR_ here would not work. */ - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_f32m1(low_.sv64, high_.sv64, 2, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -78,7 +111,9 @@ simde_vcombine_f64(simde_float64x1_t low, simde_float64x1_t high) { low_ = simde_float64x1_to_private(low), high_ = simde_float64x1_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_f64m1(low_.sv64, high_.sv64, 1, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -108,7 +143,9 @@ simde_vcombine_s8(simde_int8x8_t low, simde_int8x8_t high) { low_ = simde_int8x8_to_private(low), high_ = simde_int8x8_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i8m1(low_.sv64, high_.sv64, 8, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -138,7 +175,9 @@ simde_vcombine_s16(simde_int16x4_t low, simde_int16x4_t high) { low_ = simde_int16x4_to_private(low), high_ = simde_int16x4_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i16m1(low_.sv64, high_.sv64, 4, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -168,7 +207,9 @@ simde_vcombine_s32(simde_int32x2_t low, simde_int32x2_t high) { low_ = simde_int32x2_to_private(low), high_ = simde_int32x2_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i32m1(low_.sv64, high_.sv64, 2, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -198,7 +239,9 @@ simde_vcombine_s64(simde_int64x1_t low, simde_int64x1_t high) { low_ = simde_int64x1_to_private(low), high_ = simde_int64x1_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = 
__riscv_vslideup_vx_i64m1(low_.sv64, high_.sv64, 1, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -228,7 +271,9 @@ simde_vcombine_u8(simde_uint8x8_t low, simde_uint8x8_t high) { low_ = simde_uint8x8_to_private(low), high_ = simde_uint8x8_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u8m1(low_.sv64, high_.sv64, 8, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -258,7 +303,9 @@ simde_vcombine_u16(simde_uint16x4_t low, simde_uint16x4_t high) { low_ = simde_uint16x4_to_private(low), high_ = simde_uint16x4_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u16m1(low_.sv64, high_.sv64, 4, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -288,7 +335,9 @@ simde_vcombine_u32(simde_uint32x2_t low, simde_uint32x2_t high) { low_ = simde_uint32x2_to_private(low), high_ = simde_uint32x2_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u32m1(low_.sv64, high_.sv64, 2, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -318,7 +367,9 @@ simde_vcombine_u64(simde_uint64x1_t low, simde_uint64x1_t high) { low_ = simde_uint64x1_to_private(low), high_ = simde_uint64x1_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u64m1(low_.sv64, high_.sv64, 1, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -337,6 +388,111 @@ simde_vcombine_u64(simde_uint64x1_t low, simde_uint64x1_t high) { #define vcombine_u64(low, high) simde_vcombine_u64((low), (high)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcombine_p8(simde_poly8x8_t low, simde_poly8x8_t high) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcombine_p8(low, high); + #else + simde_poly8x16_private r_; + simde_poly8x8_private + low_ = simde_poly8x8_to_private(low), + high_ = simde_poly8x8_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcombine_p8 + #define vcombine_p8(low, high) 
simde_vcombine_p8((low), (high)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vcombine_p16(simde_poly16x4_t low, simde_poly16x4_t high) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcombine_p16(low, high); + #else + simde_poly16x8_private r_; + simde_poly16x4_private + low_ = simde_poly16x4_to_private(low), + high_ = simde_poly16x4_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcombine_p16 + #define vcombine_p16(low, high) simde_vcombine_p16((low), (high)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vcombine_p64(simde_poly64x1_t low, simde_poly64x1_t high) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vcombine_p64(low, high); + #else + simde_poly64x2_private r_; + simde_poly64x1_private + low_ = simde_poly64x1_to_private(low), + high_ = simde_poly64x1_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcombine_p64 + #define vcombine_p64(low, high) simde_vcombine_p64((low), (high)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcombine_bf16(simde_bfloat16x4_t low, simde_bfloat16x4_t high) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcombine_bf16(low, high); + #else + simde_bfloat16x8_private r_; + simde_bfloat16x4_private + low_ = simde_bfloat16x4_to_private(low), + high_ = simde_bfloat16x4_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcombine_bf16 + #define vcombine_bf16(low, high) simde_vcombine_bf16((low), (high)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/copy_lane.h b/arm/neon/copy_lane.h new file mode 100644 index 000000000..6a57c44fe --- /dev/null +++ b/arm/neon/copy_lane.h @@ -0,0 +1,1200 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_COPY_LANE_H) +#define SIMDE_ARM_NEON_COPY_LANE_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vcopy_lane_s8(simde_int8x8_t a, const int lane1, simde_int8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int8x8_private + b_ = simde_int8x8_to_private(b), + r_ = simde_int8x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s8(a, lane1, b, lane2) vcopy_lane_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s8 + #define vcopy_lane_s8(a, lane1, b, lane2) simde_vcopy_lane_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vcopy_lane_s16(simde_int16x4_t a, const int lane1, simde_int16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int16x4_private + b_ = simde_int16x4_to_private(b), + r_ = simde_int16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s16(a, lane1, b, lane2) vcopy_lane_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s16 + #define vcopy_lane_s16(a, lane1, b, lane2) simde_vcopy_lane_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcopy_lane_s32(simde_int32x2_t a, const int lane1, simde_int32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int32x2_private + b_ = simde_int32x2_to_private(b), + r_ = simde_int32x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s32(a, lane1, b, lane2) vcopy_lane_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s32 + #define vcopy_lane_s32(a, lane1, b, lane2) simde_vcopy_lane_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcopy_lane_s64(simde_int64x1_t a, const int lane1, simde_int64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_int64x1_private + b_ = simde_int64x1_to_private(b), + r_ = simde_int64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s64(a, lane1, b, lane2) vcopy_lane_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s64 + #define vcopy_lane_s64(a, lane1, b, lane2) simde_vcopy_lane_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t 
+simde_vcopy_lane_u8(simde_uint8x8_t a, const int lane1, simde_uint8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint8x8_private + b_ = simde_uint8x8_to_private(b), + r_ = simde_uint8x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u8(a, lane1, b, lane2) vcopy_lane_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u8 + #define vcopy_lane_u8(a, lane1, b, lane2) simde_vcopy_lane_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcopy_lane_u16(simde_uint16x4_t a, const int lane1, simde_uint16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint16x4_private + b_ = simde_uint16x4_to_private(b), + r_ = simde_uint16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u16(a, lane1, b, lane2) vcopy_lane_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u16 + #define vcopy_lane_u16(a, lane1, b, lane2) simde_vcopy_lane_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcopy_lane_u32(simde_uint32x2_t a, const int lane1, simde_uint32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint32x2_private + b_ = simde_uint32x2_to_private(b), + r_ = simde_uint32x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u32(a, lane1, b, lane2) vcopy_lane_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u32 + #define vcopy_lane_u32(a, lane1, b, lane2) simde_vcopy_lane_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcopy_lane_u64(simde_uint64x1_t a, const int lane1, simde_uint64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_uint64x1_private + b_ = simde_uint64x1_to_private(b), + r_ = simde_uint64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u64(a, lane1, b, lane2) vcopy_lane_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u64 + #define vcopy_lane_u64(a, lane1, b, lane2) simde_vcopy_lane_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcopy_lane_f32(simde_float32x2_t a, const int lane1, simde_float32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float32x2_private + b_ = simde_float32x2_to_private(b), + r_ = simde_float32x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_f32(a, lane1, b, lane2) vcopy_lane_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + 
#undef vcopy_lane_f32 + #define vcopy_lane_f32(a, lane1, b, lane2) simde_vcopy_lane_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcopy_lane_f64(simde_float64x1_t a, const int lane1, simde_float64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_float64x1_private + b_ = simde_float64x1_to_private(b), + r_ = simde_float64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_f64(a, lane1, b, lane2) vcopy_lane_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_f64 + #define vcopy_lane_f64(a, lane1, b, lane2) simde_vcopy_lane_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vcopy_laneq_s8(simde_int8x8_t a, const int lane1, simde_int8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_int8x8_private + r_ = simde_int8x8_to_private(a); + simde_int8x16_private + b_ = simde_int8x16_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s8(a, lane1, b, lane2) vcopy_laneq_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s8 + #define vcopy_laneq_s8(a, lane1, b, lane2) simde_vcopy_laneq_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vcopy_laneq_s16(simde_int16x4_t a, const int lane1, simde_int16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int16x4_private + r_ = simde_int16x4_to_private(a); + simde_int16x8_private + b_ = simde_int16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s16(a, lane1, b, lane2) vcopy_laneq_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s16 + #define vcopy_laneq_s16(a, lane1, b, lane2) simde_vcopy_laneq_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcopy_laneq_s32(simde_int32x2_t a, const int lane1, simde_int32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int32x2_private + r_ = simde_int32x2_to_private(a); + simde_int32x4_private + b_ = simde_int32x4_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s32(a, lane1, b, lane2) vcopy_laneq_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s32 + #define vcopy_laneq_s32(a, lane1, b, lane2) simde_vcopy_laneq_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcopy_laneq_s64(simde_int64x1_t a, const int lane1, simde_int64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int64x1_private + r_ = simde_int64x1_to_private(a); + simde_int64x2_private + b_ = simde_int64x2_to_private(b); + + r_.values[lane1] = 
b_.values[lane2]; + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s64(a, lane1, b, lane2) vcopy_laneq_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s64 + #define vcopy_laneq_s64(a, lane1, b, lane2) simde_vcopy_laneq_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vcopy_laneq_u8(simde_uint8x8_t a, const int lane1, simde_uint8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_uint8x8_private + r_ = simde_uint8x8_to_private(a); + simde_uint8x16_private + b_ = simde_uint8x16_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u8(a, lane1, b, lane2) vcopy_laneq_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u8 + #define vcopy_laneq_u8(a, lane1, b, lane2) simde_vcopy_laneq_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcopy_laneq_u16(simde_uint16x4_t a, const int lane1, simde_uint16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint16x4_private + r_ = simde_uint16x4_to_private(a); + simde_uint16x8_private + b_ = simde_uint16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u16(a, lane1, b, lane2) vcopy_laneq_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u16 + #define vcopy_laneq_u16(a, lane1, b, lane2) simde_vcopy_laneq_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcopy_laneq_u32(simde_uint32x2_t a, const int lane1, simde_uint32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint32x2_private + r_ = simde_uint32x2_to_private(a); + simde_uint32x4_private + b_ = simde_uint32x4_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u32(a, lane1, b, lane2) vcopy_laneq_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u32 + #define vcopy_laneq_u32(a, lane1, b, lane2) simde_vcopy_laneq_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcopy_laneq_u64(simde_uint64x1_t a, const int lane1, simde_uint64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint64x1_private + r_ = simde_uint64x1_to_private(a); + simde_uint64x2_private + b_ = simde_uint64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u64(a, lane1, b, lane2) vcopy_laneq_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u64 + #define vcopy_laneq_u64(a, lane1, b, lane2) simde_vcopy_laneq_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t 
+simde_vcopy_laneq_f32(simde_float32x2_t a, const int lane1, simde_float32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_float32x2_private + r_ = simde_float32x2_to_private(a); + simde_float32x4_private + b_ = simde_float32x4_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_f32(a, lane1, b, lane2) vcopy_laneq_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_f32 + #define vcopy_laneq_f32(a, lane1, b, lane2) simde_vcopy_laneq_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcopy_laneq_f64(simde_float64x1_t a, const int lane1, simde_float64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float64x1_private + r_ = simde_float64x1_to_private(a); + simde_float64x2_private + b_ = simde_float64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_f64(a, lane1, b, lane2) vcopy_laneq_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_f64 + #define vcopy_laneq_f64(a, lane1, b, lane2) simde_vcopy_laneq_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vcopyq_lane_s8(simde_int8x16_t a, const int lane1, simde_int8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int8x8_private + b_ = simde_int8x8_to_private(b); + simde_int8x16_private + r_ = simde_int8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s8(a, lane1, b, lane2) vcopyq_lane_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s8 + #define vcopyq_lane_s8(a, lane1, b, lane2) simde_vcopyq_lane_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vcopyq_lane_s16(simde_int16x8_t a, const int lane1, simde_int16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int16x4_private + b_ = simde_int16x4_to_private(b); + simde_int16x8_private + r_ = simde_int16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s16(a, lane1, b, lane2) vcopyq_lane_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s16 + #define vcopyq_lane_s16(a, lane1, b, lane2) simde_vcopyq_lane_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcopyq_lane_s32(simde_int32x4_t a, const int lane1, simde_int32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int32x2_private + b_ = simde_int32x2_to_private(b); + simde_int32x4_private + r_ = simde_int32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define 
simde_vcopyq_lane_s32(a, lane1, b, lane2) vcopyq_lane_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s32 + #define vcopyq_lane_s32(a, lane1, b, lane2) simde_vcopyq_lane_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcopyq_lane_s64(simde_int64x2_t a, const int lane1, simde_int64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_int64x1_private + b_ = simde_int64x1_to_private(b); + simde_int64x2_private + r_ = simde_int64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s64(a, lane1, b, lane2) vcopyq_lane_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s64 + #define vcopyq_lane_s64(a, lane1, b, lane2) simde_vcopyq_lane_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vcopyq_lane_u8(simde_uint8x16_t a, const int lane1, simde_uint8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint8x8_private + b_ = simde_uint8x8_to_private(b); + simde_uint8x16_private + r_ = simde_uint8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u8(a, lane1, b, lane2) vcopyq_lane_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u8 + #define vcopyq_lane_u8(a, lane1, b, lane2) simde_vcopyq_lane_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcopyq_lane_u16(simde_uint16x8_t a, const int lane1, simde_uint16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint16x4_private + b_ = simde_uint16x4_to_private(b); + simde_uint16x8_private + r_ = simde_uint16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u16(a, lane1, b, lane2) vcopyq_lane_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u16 + #define vcopyq_lane_u16(a, lane1, b, lane2) simde_vcopyq_lane_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcopyq_lane_u32(simde_uint32x4_t a, const int lane1, simde_uint32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint32x2_private + b_ = simde_uint32x2_to_private(b); + simde_uint32x4_private + r_ = simde_uint32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u32(a, lane1, b, lane2) vcopyq_lane_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u32 + #define vcopyq_lane_u32(a, lane1, b, lane2) simde_vcopyq_lane_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcopyq_lane_u64(simde_uint64x2_t a, const int lane1, simde_uint64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 
1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_uint64x1_private + b_ = simde_uint64x1_to_private(b); + simde_uint64x2_private + r_ = simde_uint64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u64(a, lane1, b, lane2) vcopyq_lane_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u64 + #define vcopyq_lane_u64(a, lane1, b, lane2) simde_vcopyq_lane_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcopyq_lane_f32(simde_float32x4_t a, const int lane1, simde_float32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float32x2_private + b_ = simde_float32x2_to_private(b); + simde_float32x4_private + r_ = simde_float32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_f32(a, lane1, b, lane2) vcopyq_lane_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_f32 + #define vcopyq_lane_f32(a, lane1, b, lane2) simde_vcopyq_lane_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcopyq_lane_f64(simde_float64x2_t a, const int lane1, simde_float64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_float64x1_private + b_ = simde_float64x1_to_private(b); + simde_float64x2_private + r_ = simde_float64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_f64(a, lane1, b, lane2) vcopyq_lane_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_f64 + #define vcopyq_lane_f64(a, lane1, b, lane2) simde_vcopyq_lane_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vcopyq_laneq_s8(simde_int8x16_t a, const int lane1, simde_int8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_int8x16_private + b_ = simde_int8x16_to_private(b), + r_ = simde_int8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s8(a, lane1, b, lane2) vcopyq_laneq_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s8 + #define vcopyq_laneq_s8(a, lane1, b, lane2) simde_vcopyq_laneq_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vcopyq_laneq_s16(simde_int16x8_t a, const int lane1, simde_int16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int16x8_private + b_ = simde_int16x8_to_private(b), + r_ = simde_int16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s16(a, lane1, b, lane2) vcopyq_laneq_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s16 + 
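As an illustrative aside (a sketch written against the wrappers defined in this block, not code taken from the patch): every function in the vcopy*_lane* family copies lane lane2 of b into lane lane1 of a and returns the result, with both indices required to be compile-time constants in range. A minimal usage example, assuming the usual simde/arm/neon.h umbrella header and the SIMDe load/store helpers:

#include <simde/arm/neon.h>
#include <stdio.h>

int main(void) {
  int32_t a_init[4] = { 10, 11, 12, 13 };
  int32_t b_init[4] = { 20, 21, 22, 23 };
  simde_int32x4_t a = simde_vld1q_s32(a_init);
  simde_int32x4_t b = simde_vld1q_s32(b_init);

  /* Copy lane 3 of b into lane 1 of a; all other lanes of a are preserved. */
  simde_int32x4_t r = simde_vcopyq_laneq_s32(a, 1, b, 3);

  int32_t out[4];
  simde_vst1q_s32(out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 10 23 12 13 */
  return 0;
}

When SIMDE_ENABLE_NATIVE_ALIASES (or the A64V8 alias macro) is defined, the same call can typically be spelled vcopyq_laneq_s32(a, 1, b, 3) through the #define aliases emitted in these blocks.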
#define vcopyq_laneq_s16(a, lane1, b, lane2) simde_vcopyq_laneq_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcopyq_laneq_s32(simde_int32x4_t a, const int lane1, simde_int32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int32x4_private + b_ = simde_int32x4_to_private(b), + r_ = simde_int32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s32(a, lane1, b, lane2) vcopyq_laneq_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s32 + #define vcopyq_laneq_s32(a, lane1, b, lane2) simde_vcopyq_laneq_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcopyq_laneq_s64(simde_int64x2_t a, const int lane1, simde_int64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int64x2_private + b_ = simde_int64x2_to_private(b), + r_ = simde_int64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s64(a, lane1, b, lane2) vcopyq_laneq_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s64 + #define vcopyq_laneq_s64(a, lane1, b, lane2) simde_vcopyq_laneq_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vcopyq_laneq_u8(simde_uint8x16_t a, const int lane1, simde_uint8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_uint8x16_private + b_ = simde_uint8x16_to_private(b), + r_ = simde_uint8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u8(a, lane1, b, lane2) vcopyq_laneq_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u8 + #define vcopyq_laneq_u8(a, lane1, b, lane2) simde_vcopyq_laneq_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcopyq_laneq_u16(simde_uint16x8_t a, const int lane1, simde_uint16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint16x8_private + b_ = simde_uint16x8_to_private(b), + r_ = simde_uint16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u16(a, lane1, b, lane2) vcopyq_laneq_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u16 + #define vcopyq_laneq_u16(a, lane1, b, lane2) simde_vcopyq_laneq_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcopyq_laneq_u32(simde_uint32x4_t a, const int lane1, simde_uint32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint32x4_private + b_ = simde_uint32x4_to_private(b), + r_ = simde_uint32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x4_from_private(r_); +} +#if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u32(a, lane1, b, lane2) vcopyq_laneq_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u32 + #define vcopyq_laneq_u32(a, lane1, b, lane2) simde_vcopyq_laneq_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcopyq_laneq_u64(simde_uint64x2_t a, const int lane1, simde_uint64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint64x2_private + b_ = simde_uint64x2_to_private(b), + r_ = simde_uint64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u64(a, lane1, b, lane2) vcopyq_laneq_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u64 + #define vcopyq_laneq_u64(a, lane1, b, lane2) simde_vcopyq_laneq_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcopyq_laneq_f32(simde_float32x4_t a, const int lane1, simde_float32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_float32x4_private + b_ = simde_float32x4_to_private(b), + r_ = simde_float32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_f32(a, lane1, b, lane2) vcopyq_laneq_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_f32 + #define vcopyq_laneq_f32(a, lane1, b, lane2) simde_vcopyq_laneq_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcopyq_laneq_f64(simde_float64x2_t a, const int lane1, simde_float64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float64x2_private + b_ = simde_float64x2_to_private(b), + r_ = simde_float64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_f64(a, lane1, b, lane2) vcopyq_laneq_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_f64 + #define vcopyq_laneq_f64(a, lane1, b, lane2) simde_vcopyq_laneq_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcopy_lane_p8(simde_poly8x8_t a, const int lane1, simde_poly8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly8x8_private + b_ = simde_poly8x8_to_private(b), + r_ = simde_poly8x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_lane_p8(a, lane1, b, lane2) vcopy_lane_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopy_lane_p8 + #define vcopy_lane_p8(a, lane1, b, lane2) simde_vcopy_lane_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vcopy_lane_p16(simde_poly16x4_t a, 
const int lane1, simde_poly16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_poly16x4_private + b_ = simde_poly16x4_to_private(b), + r_ = simde_poly16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_lane_p16(a, lane1, b, lane2) vcopy_lane_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopy_lane_p16 + #define vcopy_lane_p16(a, lane1, b, lane2) simde_vcopy_lane_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vcopy_lane_p64(simde_poly64x1_t a, const int lane1, simde_poly64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_poly64x1_private + b_ = simde_poly64x1_to_private(b), + r_ = simde_poly64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_lane_p64(a, lane1, b, lane2) vcopy_lane_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopy_lane_p64 + #define vcopy_lane_p64(a, lane1, b, lane2) simde_vcopy_lane_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcopy_laneq_p8(simde_poly8x8_t a, const int lane1, simde_poly8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_poly8x8_private + r_ = simde_poly8x8_to_private(a); + simde_poly8x16_private + b_ = simde_poly8x16_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_laneq_p8(a, lane1, b, lane2) vcopy_laneq_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopy_laneq_p8 + #define vcopy_laneq_p8(a, lane1, b, lane2) simde_vcopy_laneq_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vcopy_laneq_p16(simde_poly16x4_t a, const int lane1, simde_poly16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly16x4_private + r_ = simde_poly16x4_to_private(a); + simde_poly16x8_private + b_ = simde_poly16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_laneq_p16(a, lane1, b, lane2) vcopy_laneq_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopy_laneq_p16 + #define vcopy_laneq_p16(a, lane1, b, lane2) simde_vcopy_laneq_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vcopy_laneq_p64(simde_poly64x1_t a, const int lane1, simde_poly64x2_t b, const int lane2) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_poly64x1_private + r_ = simde_poly64x1_to_private(a); + simde_poly64x2_private + b_ = simde_poly64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_laneq_p64(a, lane1, b, lane2) vcopy_laneq_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopy_laneq_p64 + #define vcopy_laneq_p64(a, lane1, b, lane2) simde_vcopy_laneq_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcopyq_lane_p8(simde_poly8x16_t a, const int lane1, simde_poly8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly8x8_private + b_ = simde_poly8x8_to_private(b); + simde_poly8x16_private + r_ = simde_poly8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_lane_p8(a, lane1, b, lane2) vcopyq_lane_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopyq_lane_p8 + #define vcopyq_lane_p8(a, lane1, b, lane2) simde_vcopyq_lane_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vcopyq_lane_p16(simde_poly16x8_t a, const int lane1, simde_poly16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_poly16x4_private + b_ = simde_poly16x4_to_private(b); + simde_poly16x8_private + r_ = simde_poly16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_lane_p16(a, lane1, b, lane2) vcopyq_lane_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopyq_lane_p16 + #define vcopyq_lane_p16(a, lane1, b, lane2) simde_vcopyq_lane_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vcopyq_lane_p64(simde_poly64x2_t a, const int lane1, simde_poly64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_poly64x1_private + b_ = simde_poly64x1_to_private(b); + simde_poly64x2_private + r_ = simde_poly64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_lane_p64(a, lane1, b, lane2) vcopyq_lane_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopyq_lane_p64 + #define vcopyq_lane_p64(a, lane1, b, lane2) simde_vcopyq_lane_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcopyq_laneq_p8(simde_poly8x16_t a, const int lane1, simde_poly8x16_t b, const int lane2) 
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_poly8x16_private + b_ = simde_poly8x16_to_private(b), + r_ = simde_poly8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_laneq_p8(a, lane1, b, lane2) vcopyq_laneq_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopyq_laneq_p8 + #define vcopyq_laneq_p8(a, lane1, b, lane2) simde_vcopyq_laneq_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vcopyq_laneq_p16(simde_poly16x8_t a, const int lane1, simde_poly16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly16x8_private + b_ = simde_poly16x8_to_private(b), + r_ = simde_poly16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_laneq_p16(a, lane1, b, lane2) vcopyq_laneq_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopyq_laneq_p16 + #define vcopyq_laneq_p16(a, lane1, b, lane2) simde_vcopyq_laneq_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vcopyq_laneq_p64(simde_poly64x2_t a, const int lane1, simde_poly64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_poly64x2_private + b_ = simde_poly64x2_to_private(b), + r_ = simde_poly64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_laneq_p64(a, lane1, b, lane2) vcopyq_laneq_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vcopyq_laneq_p64 + #define vcopyq_laneq_p64(a, lane1, b, lane2) simde_vcopyq_laneq_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcopy_lane_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_bfloat16x4_private + b_ = simde_bfloat16x4_to_private(b), + r_ = simde_bfloat16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopy_lane_bf16(a, lane1, b, lane2) vcopy_lane_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcopy_lane_bf16 + #define vcopy_lane_bf16(a, lane1, b, lane2) simde_vcopy_lane_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcopy_laneq_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(a); + simde_bfloat16x8_private b_ = simde_bfloat16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopy_laneq_bf16(a, lane1, b, lane2) vcopy_laneq_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcopy_laneq_bf16 + #define vcopy_laneq_bf16(a, lane1, b, lane2) simde_vcopy_laneq_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcopyq_lane_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopyq_lane_bf16(a, lane1, b, lane2) vcopyq_lane_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcopyq_lane_bf16 + #define vcopyq_lane_bf16(a, lane1, b, lane2) simde_vcopyq_lane_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcopyq_laneq_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_bfloat16x8_private + b_ = simde_bfloat16x8_to_private(b), + r_ = simde_bfloat16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopyq_laneq_bf16(a, lane1, b, lane2) vcopyq_laneq_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcopyq_laneq_bf16 + #define vcopyq_laneq_bf16(a, lane1, b, lane2) simde_vcopyq_laneq_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_COPY_LANE_H */ diff --git a/arm/neon/crc32.h b/arm/neon/crc32.h new file mode 100644 index 000000000..50f8f1424 --- /dev/null +++ b/arm/neon/crc32.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CRC32_H) +#define SIMDE_ARM_NEON_CRC32_H + +#include "types.h" + +#if defined(__ARM_ACLE) || (defined(__GNUC__) && defined(SIMDE_ARCH_ARM_CRC32)) + #include +#endif +// ^^ Due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70974 + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t simde_crc32_reverseBits(uint64_t num, int num_of_bits) +{ + uint64_t reverse_num = 0; + for (int i = 0; i < num_of_bits; i++) { + if (num & (1ULL << i)) + reverse_num |= 1ULL << (num_of_bits - 1 - i); + } + return reverse_num; +} + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t simde_crc32_eor_mask(uint32_t a, uint32_t b, uint32_t mask) { + uint32_t part_a = a & mask; + uint32_t part_result = part_a ^ b; + uint32_t result = (a & ~mask) | part_result; + return result; +} + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32b(uint32_t a, uint8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32b(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 8) << 24)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 24; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x00FFFFFF) << 8) | ((tail & 0xFF000000) >> 24); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32b + #define __crc32b(a, b) simde___crc32b((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32h(uint32_t a, uint16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32h(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 16) << 16)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 16; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x0000FFFF) << 16) | ((tail & 0xFFFF0000) >> 16); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32h + #define __crc32h(a, b) simde___crc32h((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32w(uint32_t a, uint32_t 
b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32w(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 32)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32w + #define __crc32w(a, b) simde___crc32w((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32d(uint32_t a, uint64_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32d(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint64_t r_val = simde_crc32_reverseBits(b, 64); + uint32_t val_head = HEDLEY_STATIC_CAST(uint32_t, r_val >> 32); + uint32_t val_mid = HEDLEY_STATIC_CAST(uint32_t, r_val & 0x00000000FFFFFFFF); + uint32_t head = r_acc ^ val_head; + uint32_t mid = 0u ^ val_mid; + uint32_t tail = 0u; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + mid = simde_crc32_eor_mask(mid, poly << i, 0xFFFFFFFF); + tail = simde_crc32_eor_mask(tail, 0x0, 0xFFFFFFFF); + } + } + for(int i = 31; i >= 0; --i) { + if ((mid>>i) & 1) { + mid = simde_crc32_eor_mask(mid, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32d + #define __crc32d(a, b) simde___crc32d((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32cb(uint32_t a, uint8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32cb(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 8) << 24)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 24; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x00FFFFFF) << 8) | ((tail & 0xFF000000) >> 24); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32cb + #define __crc32cb(a, b) simde___crc32cb((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32ch(uint32_t a, uint16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32ch(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, 
simde_crc32_reverseBits(b, 16) << 16); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 16; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x0000FFFF) << 16) | ((tail & 0xFFFF0000) >> 16); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32ch + #define __crc32ch(a, b) simde___crc32ch((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32cw(uint32_t a, uint32_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32cw(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 32)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32cw + #define __crc32cw(a, b) simde___crc32cw((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32cd(uint32_t a, uint64_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) + return __crc32cd(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint64_t r_val = simde_crc32_reverseBits(b, 64); + uint32_t val_head = HEDLEY_STATIC_CAST(uint32_t, r_val >> 32); + uint32_t val_mid = HEDLEY_STATIC_CAST(uint32_t, r_val & 0x00000000FFFFFFFF); + uint32_t head = r_acc ^ val_head; + uint32_t mid = 0u ^ val_mid; + uint32_t tail = 0u; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + mid = simde_crc32_eor_mask(mid, poly << i, 0xFFFFFFFF); + tail = simde_crc32_eor_mask(tail, 0x0, 0xFFFFFFFF); + } + } + for(int i = 31; i >= 0; --i) { + if ((mid>>i) & 1) { + mid = simde_crc32_eor_mask(mid, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) + #undef __crc32cd + #define __crc32cd(a, b) simde___crc32cd((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CRC32_H) */ diff --git a/arm/neon/create.h b/arm/neon/create.h index 57f6f6eba..4ba48cf0d 100644 --- a/arm/neon/create.h +++ b/arm/neon/create.h @@ -23,12 +23,9 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ -/* N.B. CM: vcreate_f16 and vcreate_bf16 are omitted as - * SIMDe has no 16-bit floating point support. - * Idem for the poly types. 
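Stepping back from the diff for a moment: the portable branches in crc32.h above emulate the ARMv8 CRC32/CRC32C instructions by bit-reversing the operands and performing polynomial long division (polynomial 0x04C11DB7 for __crc32*, 0x1EDC6F41 for __crc32c*). For readers who want a cross-check, the conventional reflected (LSB-first) per-byte update should agree with simde___crc32b; the sketch below is a test aid written for this note, not code from the patch:

#include <stdint.h>

/* Reference reflected CRC-32 update for one byte (polynomial 0x04C11DB7,
 * reflected constant 0xEDB88320).  Expected to agree with
 * simde___crc32b(acc, byte); for the CRC-32C variants the reflected
 * constant would be 0x82F63B78 instead. */
static uint32_t ref_crc32b(uint32_t acc, uint8_t byte) {
  acc ^= byte;
  for (int i = 0; i < 8; i++) {
    acc = (acc >> 1) ^ (UINT32_C(0xEDB88320) & (uint32_t)-(int32_t)(acc & 1));
  }
  return acc;
}

/* e.g. assert(ref_crc32b(0xFFFFFFFFu, 0x61) == simde___crc32b(0xFFFFFFFFu, 0x61)); */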
*/ - #if !defined(SIMDE_ARM_NEON_CREATE_H) #define SIMDE_ARM_NEON_CREATE_H @@ -152,6 +149,21 @@ simde_vcreate_u64(uint64_t a) { #define vcreate_u64(a) simde_vcreate_u64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcreate_f16(uint64_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcreate_f16(a); + #else + return simde_vreinterpret_f16_u64(simde_vdup_n_u64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcreate_f16 + #define vcreate_f16(a) simde_vcreate_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcreate_f32(uint64_t a) { @@ -180,6 +192,63 @@ simde_vcreate_f64(uint64_t a) { #define vcreate_f64(a) simde_vcreate_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcreate_p8(simde_poly64_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcreate_p8(a); + #else + return simde_vreinterpret_p8_p64(simde_vdup_n_p64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcreate_p8 + #define vcreate_p8(a) simde_vcreate_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vcreate_p16(simde_poly64_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcreate_p16(a); + #else + return simde_vreinterpret_p16_p64(simde_vdup_n_p64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcreate_p16 + #define vcreate_p16(a) simde_vcreate_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vcreate_p64(simde_poly64_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vcreate_p64(a); + #else + return simde_vdup_n_p64(a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcreate_p64 + #define vcreate_p64(a) simde_vcreate_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcreate_bf16(uint64_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcreate_bf16(a); + #else + return simde_vreinterpret_bf16_u64(simde_vdup_n_u64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcreate_bf16 + #define vcreate_bf16(a) simde_vcreate_bf16(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/cvt.h b/arm/neon/cvt.h index 7a43bb5a9..96c39afe1 100644 --- a/arm/neon/cvt.h +++ b/arm/neon/cvt.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Sean Maher * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CVT_H) @@ -43,7 +44,9 @@ simde_vcvt_f16_f32(simde_float32x4_t a) { simde_float32x4_private a_ = simde_float32x4_to_private(a); simde_float16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfncvt_f_f_w_f16m1(__riscv_vlmul_ext_v_f32m1_f32m2(a_.sv128), 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -55,7 +58,8 @@ simde_vcvt_f16_f32(simde_float32x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f16_f32 #define vcvt_f16_f32(a) 
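One more aside, on the vcreate_* additions just above: vcreate reinterprets a 64-bit scalar as a 64-bit NEON vector, which is why the fallbacks are simply a vdup_n_u64/vdup_n_p64 followed by a reinterpret. A minimal sketch (function names as added above; the lane-numbering comment assumes a little-endian target):

#include <simde/arm/neon.h>

/* Build a poly8x8 whose lanes count 0x01..0x08 from one packed constant;
 * lane 0 takes the least-significant byte of the 64-bit value. */
static simde_poly8x8_t make_counting_poly8(void) {
  return simde_vcreate_p8(
    HEDLEY_STATIC_CAST(simde_poly64_t, UINT64_C(0x0807060504030201)));
}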
simde_vcvt_f16_f32(a) #endif @@ -69,7 +73,9 @@ simde_vcvt_f32_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vfwcvt_f_f_v_f32m2(a_.sv64, 4)); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -81,7 +87,8 @@ simde_vcvt_f32_f16(simde_float16x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f32_f16 #define vcvt_f32_f16(a) simde_vcvt_f32_f16(a) #endif @@ -95,8 +102,10 @@ simde_vcvt_f32_f64(simde_float64x2_t a) { simde_float64x2_private a_ = simde_float64x2_to_private(a); simde_float32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfncvt_f_f_w_f32m1(__riscv_vlmul_ext_v_f64m1_f64m2(a_.sv128), 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -121,8 +130,10 @@ simde_vcvt_f64_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_float64x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vfwcvt_f_f_v_f64m2(a_.sv64, 2)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -139,42 +150,139 @@ simde_vcvt_f64_f32(simde_float32x2_t a) { #endif SIMDE_FUNCTION_ATTRIBUTES -int16_t -simde_x_vcvts_s16_f16(simde_float16 a) { - #if defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_ARM_NEON_FP16) - return HEDLEY_STATIC_CAST(int16_t, a); +uint16_t +simde_vcvth_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_float16_to_float32(a)); #else simde_float32 af = simde_float16_to_float32(a); - if (HEDLEY_UNLIKELY(af < HEDLEY_STATIC_CAST(simde_float32, INT16_MIN))) { - return INT16_MIN; - } else if (HEDLEY_UNLIKELY(af > HEDLEY_STATIC_CAST(simde_float32, INT16_MAX))) { - return INT16_MAX; - } else if (HEDLEY_UNLIKELY(simde_math_isnanf(af))) { + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (simde_isnanhf(a)) { return 0; } else { - return HEDLEY_STATIC_CAST(int16_t, af); + return HEDLEY_STATIC_CAST(uint16_t, af); } #endif } +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_u16_f16 + #define vcvth_u16_f16(a) simde_vcvth_u16_f16(a) +#endif SIMDE_FUNCTION_ATTRIBUTES -uint16_t -simde_x_vcvts_u16_f16(simde_float16 a) { - #if defined(SIMDE_FAST_CONVERSION_RANGE) - return HEDLEY_STATIC_CAST(uint16_t, simde_float16_to_float32(a)); +int32_t 
+simde_vcvth_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_float16_to_float32(a)); #else simde_float32 af = simde_float16_to_float32(a); - if (HEDLEY_UNLIKELY(af < SIMDE_FLOAT32_C(0.0))) { + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { return 0; - } else if (HEDLEY_UNLIKELY(af > HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { - return UINT16_MAX; - } else if (simde_math_isnanf(af)) { + } else { + return HEDLEY_STATIC_CAST(int32_t, af); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_s32_f16 + #define vcvth_s32_f16(a) simde_vcvth_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvth_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_float16_to_float32(a)); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { return 0; } else { - return HEDLEY_STATIC_CAST(uint16_t, af); + return HEDLEY_STATIC_CAST(uint32_t, af); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_u32_f16 + #define vcvth_u32_f16(a) simde_vcvth_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvth_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_float16_to_float32(a)); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, af); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_s64_f16 + #define vcvth_s64_f16(a) simde_vcvth_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvth_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_float16_to_float32(a)); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, af); } #endif } +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_u64_f16 + #define vcvth_u64_f16(a) simde_vcvth_u64_f16(a) +#endif SIMDE_FUNCTION_ATTRIBUTES int32_t @@ -219,7 +327,8 @@ simde_vcvts_u32_f32(simde_float32 a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvts_u32_f32 #define vcvts_u32_f32(a) simde_vcvts_u32_f32(a) #endif @@ -247,7 +356,8 @@ simde_vcvts_f32_u32 (uint32_t a) { return HEDLEY_STATIC_CAST(simde_float32, a); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvts_f32_u32 #define vcvts_f32_u32(a) simde_vcvts_f32_u32(a) #endif @@ -265,7 +375,7 @@ simde_vcvtd_s64_f64(simde_float64 a) { return INT64_MIN; } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) { return INT64_MAX; - } else if (simde_math_isnanf(a)) { + } else if (simde_math_isnan(a)) { return 0; } else { return HEDLEY_STATIC_CAST(int64_t, a); @@ -296,7 +406,8 @@ simde_vcvtd_u64_f64(simde_float64 a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtd_u64_f64 #define vcvtd_u64_f64(a) simde_vcvtd_u64_f64(a) #endif @@ -324,35 +435,112 @@ simde_vcvtd_f64_u64(uint64_t a) { return HEDLEY_STATIC_CAST(simde_float64, a); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtd_f64_u64 #define vcvtd_f64_u64(a) simde_vcvtd_f64_u64(a) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_int16x4_t -simde_vcvt_s16_f16(simde_float16x4_t a) { +simde_float16_t +simde_vcvth_f16_u32(uint32_t a) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) - return vcvt_s16_f16(a); + return vcvth_f16_u32(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); #else - simde_float16x4_private a_ = simde_float16x4_to_private(a); - simde_int16x4_private r_; + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_f16_u32 + #define vcvth_f16_u32(a) simde_vcvth_f16_u32(a) +#endif - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) - SIMDE_CONVERT_VECTOR_(r_.values, a_.values); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_s16_f16(a_.values[i]); - } - #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_u64(uint64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_u64(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return 
simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_f16_u64 + #define vcvth_f16_u64(a) simde_vcvth_f16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_s32(int32_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_s32(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_f16_s32 + #define vcvth_f16_s32(a) simde_vcvth_f16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_s64(int64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_s64(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_f16_s64 + #define vcvth_f16_s64(a) simde_vcvth_f16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_s16(int16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_s16(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_f16_s16 + #define vcvth_f16_s16(a) simde_vcvth_f16_s16(a) +#endif - return simde_int16x4_from_private(r_); +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_u16(uint16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_u16(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcvt_s16_f16 - #define vcvt_s16_f16(a) simde_vcvt_s16_f16(a) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_f16_u16 + #define vcvth_f16_u16(a) simde_vcvth_f16_u16(a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -364,7 +552,17 @@ simde_vcvt_s32_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_int32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv64, 2); + #else + r_.sv64 = __riscv_vmerge_vxm_i32m1( + 
__riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv64, 2), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2), + 2); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -390,19 +588,22 @@ simde_vcvt_u16_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_uint16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfcvt_rtz_xu_f_v_u16m1(a_.sv64, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_u16_f16(a_.values[i]); + r_.values[i] = simde_vcvth_u16_f16(a_.values[i]); } #endif return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_u16_f16 #define vcvt_u16_f16(a) simde_vcvt_u16_f16(a) #endif @@ -416,7 +617,17 @@ simde_vcvt_u32_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv64, 2); + #else + r_.sv64 = __riscv_vmerge_vxm_u32m1( + __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv64, 2), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2), + 2); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -428,7 +639,8 @@ simde_vcvt_u32_f32(simde_float32x2_t a) { return simde_uint32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_u32_f32 #define vcvt_u32_f32(a) simde_vcvt_u32_f32(a) #endif @@ -442,7 +654,17 @@ simde_vcvt_s64_f64(simde_float64x1_t a) { simde_float64x1_private a_ = simde_float64x1_to_private(a); simde_int64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv64, 1); + #else + r_.sv64 = __riscv_vmerge_vxm_i64m1( + __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv64, 1), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1), + 1); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -468,7 +690,17 @@ simde_vcvt_u64_f64(simde_float64x1_t a) { simde_float64x1_private a_ = simde_float64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv64, 1); + #else + r_.sv64 = __riscv_vmerge_vxm_u64m1( + 
__riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv64, 1), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1), + 1); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (a_.values >= SIMDE_FLOAT64_C(0.0))); #else @@ -481,38 +713,12 @@ simde_vcvt_u64_f64(simde_float64x1_t a) { return simde_uint64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_u64_f64 #define vcvt_u64_f64(a) simde_vcvt_u64_f64(a) #endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_int16x8_t -simde_vcvtq_s16_f16(simde_float16x8_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) - return vcvtq_s16_f16(a); - #else - simde_float16x8_private a_ = simde_float16x8_to_private(a); - simde_int16x8_private r_; - - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) - SIMDE_CONVERT_VECTOR_(r_.values, a_.values); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_s16_f16(a_.values[i]); - } - #endif - - return simde_int16x8_from_private(r_); - #endif -} -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcvtq_s16_f16 - #define vcvtq_s16_f16(a) simde_vcvtq_s16_f16(a) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vcvtq_s32_f32(simde_float32x4_t a) { @@ -560,6 +766,16 @@ simde_vcvtq_s32_f32(simde_float32x4_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castps_si128(_mm_cmpord_ps(a_.m128, a_.m128))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv128, 4); + #else + r_.sv128 = __riscv_vmerge_vxm_i32m1( + __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv128, 4), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4), + 4); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) @@ -600,19 +816,22 @@ simde_vcvtq_u16_f16(simde_float16x8_t a) { simde_float16x8_private a_ = simde_float16x8_to_private(a); simde_uint16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfcvt_rtz_xu_f_v_u16m1(a_.sv128, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_u16_f16(a_.values[i]); + r_.values[i] = simde_vcvth_u16_f16(a_.values[i]); } #endif return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_u16_f16 #define vcvtq_u16_f16(a) simde_vcvtq_u16_f16(a) #endif @@ -657,6 +876,16 @@ simde_vcvtq_u32_f32(simde_float32x4_t a) { #if !defined(SIMDE_FAST_NANS) 
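A short note on the RISC-V pattern repeated throughout these conversion hunks (sketch and commentary added for this note, not text from the patch): each float-to-integer path converts with a round-toward-zero intrinsic and, unless SIMDE_FAST_CONVERSION_RANGE is defined, uses __riscv_vfclass_v_* to flag lanes whose class value is 512 (quiet NaN) and __riscv_vmerge_vxm_* to force those lanes to 0, matching the Arm behaviour of converting NaN to 0. The scalar model of the semantics being implemented looks roughly like this:

#include <stdint.h>
#include <math.h>

/* Scalar model of the float32 -> uint32 conversion these vector paths
 * implement: truncate toward zero, saturate out-of-range inputs, and map
 * NaN to 0 (the part handled by the vfclass/vmerge pair on RISC-V). */
static uint32_t cvt_f32_to_u32_model(float x) {
  if (isnan(x)) return 0;
  if (x <= 0.0f) return 0;
  if (x >= (float) UINT32_MAX) return UINT32_MAX;
  return (uint32_t) x;
}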
r_.m128i = _mm_and_si128(r_.m128i, _mm_castps_si128(_mm_cmpord_ps(a_.m128, a_.m128))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv128, 4); + #else + r_.sv128 = __riscv_vmerge_vxm_u32m1( + __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv128, 4), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4), + 4); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) @@ -679,7 +908,8 @@ simde_vcvtq_u32_f32(simde_float32x4_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_u32_f32 #define vcvtq_u32_f32(a) simde_vcvtq_u32_f32(a) #endif @@ -737,6 +967,16 @@ simde_vcvtq_s64_f64(simde_float64x2_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castpd_si128(_mm_cmpord_pd(a_.m128d, a_.m128d))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv128, 2); + #else + r_.sv128 = __riscv_vmerge_vxm_i64m1( + __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv128, 2), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2), + 2); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) @@ -816,6 +1056,16 @@ simde_vcvtq_u64_f64(simde_float64x2_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castpd_si128(_mm_cmpord_pd(a_.m128d, a_.m128d))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv128, 2); + #else + r_.sv128 = __riscv_vmerge_vxm_u64m1( + __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv128, 2), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2), + 2); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); @@ -836,7 +1086,8 @@ simde_vcvtq_u64_f64(simde_float64x2_t a) { return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_u64_f64 #define vcvtq_u64_f64(a) simde_vcvtq_u64_f64(a) #endif @@ -850,7 +1101,9 @@ simde_vcvt_f16_s16(simde_int16x4_t a) { simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_float16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfcvt_f_x_v_f16m1(a_.sv64, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -866,7 +1119,8 @@ simde_vcvt_f16_s16(simde_int16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f16_s16 #define vcvt_f16_s16(a) 
simde_vcvt_f16_s16(a) #endif @@ -880,8 +1134,10 @@ simde_vcvt_f32_s32(simde_int32x2_t a) { simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_float32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_x_v_f32m1(a_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -906,19 +1162,24 @@ simde_vcvt_f16_u16(simde_uint16x4_t a) { simde_uint16x4_private a_ = simde_uint16x4_to_private(a); simde_float16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - #if SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI - r_.values[i] = HEDLEY_STATIC_CAST(simde_float16_t, a_.values[i]); - #else - r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a_.values[i])); - #endif - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfcvt_f_xu_v_f16m1(a_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + r_.values[i] = HEDLEY_STATIC_CAST(simde_float16_t, a_.values[i]); + #else + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a_.values[i])); + #endif + } + #endif return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f16_u16 #define vcvt_f16_u16(a) simde_vcvt_f16_u16(a) #endif @@ -932,8 +1193,10 @@ simde_vcvt_f32_u32(simde_uint32x2_t a) { simde_uint32x2_private a_ = simde_uint32x2_to_private(a); simde_float32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_xu_v_f32m1(a_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -944,7 +1207,8 @@ simde_vcvt_f32_u32(simde_uint32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_f32_u32 #define vcvt_f32_u32(a) simde_vcvt_f32_u32(a) #endif @@ -958,8 +1222,10 @@ simde_vcvt_f64_s64(simde_int64x1_t a) { simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_float64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_x_v_f64m1(a_.sv64, 1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -984,8 +1250,10 @@ simde_vcvt_f64_u64(simde_uint64x1_t a) { simde_uint64x1_private a_ = simde_uint64x1_to_private(a); simde_float64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + 
#elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_xu_v_f64m1(a_.sv64, 1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -996,7 +1264,8 @@ simde_vcvt_f64_u64(simde_uint64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_f64_u64 #define vcvt_f64_u64(a) simde_vcvt_f64_u64(a) #endif @@ -1010,7 +1279,9 @@ simde_vcvtq_f16_s16(simde_int16x8_t a) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_float16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfcvt_f_x_v_f16m1(a_.sv128, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1026,7 +1297,8 @@ simde_vcvtq_f16_s16(simde_int16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_f16_s16 #define vcvtq_f16_s16(a) simde_vcvtq_f16_s16(a) #endif @@ -1040,8 +1312,10 @@ simde_vcvtq_f32_s32(simde_int32x4_t a) { simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_x_v_f32m1(a_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1066,7 +1340,9 @@ simde_vcvtq_f16_u16(simde_uint16x8_t a) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_float16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfcvt_f_xu_v_f16m1(a_.sv128, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1082,7 +1358,8 @@ simde_vcvtq_f16_u16(simde_uint16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_f16_u16 #define vcvtq_f16_u16(a) simde_vcvtq_f16_u16(a) #endif @@ -1096,8 +1373,10 @@ simde_vcvtq_f32_u32(simde_uint32x4_t a) { simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_xu_v_f32m1(a_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1108,7 +1387,8 @@ simde_vcvtq_f32_u32(simde_uint32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_f32_u32 #define vcvtq_f32_u32(a) simde_vcvtq_f32_u32(a) #endif @@ -1119,15 +1399,17 @@ simde_vcvtq_f64_s64(simde_int64x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vcvtq_f64_s64(a); #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - return vec_ctd(a, 0); + return vec_double(a); #else simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_float64x2_private r_; #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) r_.m128d = _mm_cvtepi64_pd(a_.m128i); - #elif defined(SIMDE_CONVERT_VECTOR_) + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_x_v_f64m1(a_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1152,8 +1434,10 @@ simde_vcvtq_f64_u64(simde_uint64x2_t a) { simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_float64x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_xu_v_f64m1(a_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1164,11 +1448,810 @@ simde_vcvtq_f64_u64(simde_uint64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_f64_u64 #define vcvtq_f64_u64(a) simde_vcvtq_f64_u64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtah_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtah_u16_f16 + #define vcvtah_u16_f16(a) simde_vcvtah_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtah_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(af)); + } + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtah_s32_f16 + #define vcvtah_s32_f16(a) simde_vcvtah_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtah_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtah_u32_f16 + #define vcvtah_u32_f16(a) simde_vcvtah_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtah_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtah_s64_f16 + #define vcvtah_s64_f16(a) simde_vcvtah_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtah_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtah_u64_f16 + #define vcvtah_u64_f16(a) simde_vcvtah_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtad_s64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtad_s64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, simde_math_round(a)); + #else + if (HEDLEY_UNLIKELY(a <= HEDLEY_STATIC_CAST(simde_float64, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(a >= 
HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnan(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_round(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtad_s64_f64 + #define vcvtad_s64_f64(a) simde_vcvtad_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtad_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtad_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_round(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_round(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvtad_u64_f64 + #define vcvtad_u64_f64(a) simde_vcvtad_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtas_s32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtas_s32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a)); + #else + if (HEDLEY_UNLIKELY(a <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtas_s32_f32 + #define vcvtas_s32_f32(a) simde_vcvtas_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtas_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtas_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(a)); + #else + if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + if (a < 0) return 0; + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtas_u32_f32 + #define vcvtas_u32_f32(a) simde_vcvtas_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvta_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvta_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtah_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvta_u16_f16 + #define vcvta_u16_f16(a) simde_vcvta_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcvta_s64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + 
return vcvta_s64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_int64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_s64_f64(a_.values[i]); + } + + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_s64_f64 + #define vcvta_s64_f64(a) simde_vcvta_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvta_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_u64_f64 + #define vcvta_u64_f64(a) simde_vcvta_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcvta_s32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_s32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_int32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_s32_f32(a_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_s32_f32 + #define vcvta_s32_f32(a) simde_vcvta_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtaq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtaq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtah_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtaq_u16_f16 + #define vcvtaq_u16_f16(a) simde_vcvtaq_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcvtaq_s32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_s32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_int32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_s32_f32(a_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_s32_f32 + #define vcvtaq_s32_f32(a) simde_vcvtaq_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcvtaq_s64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_s64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_int64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_s64_f64(a_.values[i]); + } + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + 
#undef vcvtaq_s64_f64 + #define vcvtaq_s64_f64(a) simde_vcvtaq_s64_f64(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtaq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_u64_f64(a_.values[i]); + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_u64_f64 + #define vcvtaq_u64_f64(a) simde_vcvtaq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvta_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_u32_f32 + #define vcvta_u32_f32(a) simde_vcvta_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtaq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_u32_f32(a_.values[i]); + } + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_u32_f32 + #define vcvtaq_u32_f32(a) simde_vcvtaq_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcvt_high_f16_f32(simde_float16x4_t r, simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvt_high_f16_f32(r, a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(r); + simde_float16x8_private r_; + + size_t half_pos = (sizeof(r_.values) / sizeof(r_.values[0]) / 2); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < half_pos; i++) { + r_.values[i] = b_.values[i]; + } + SIMDE_VECTORIZE + for (size_t i = half_pos; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(a_.values[i-half_pos]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvt_high_f16_f32 + #define vcvt_high_f16_f32(r, a) simde_vcvt_high_f16_f32((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvt_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvt_high_f32_f64(r, a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_float32x2_private b_ = simde_float32x2_to_private(r); + simde_float32x4_private r_; + + size_t half_pos = (sizeof(r_.values) / sizeof(r_.values[0]) / 2); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < half_pos; i++) { + r_.values[i] = b_.values[i]; + } + SIMDE_VECTORIZE + for (size_t i = half_pos; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32, a_.values[i-half_pos]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvt_high_f32_f64 + #define vcvt_high_f32_f64(r, a) simde_vcvt_high_f32_f64((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvt_high_f32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvt_high_f32_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_float32x4_private r_; + + size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < rsize; i++) { + r_.values[i] = simde_float16_to_float32(a_.values[i+rsize]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvt_high_f32_f16 + #define vcvt_high_f32_f16(a) simde_vcvt_high_f32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcvt_high_f64_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvt_high_f64_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_float64x2_private r_; + + size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64, a_.values[i+rsize]); + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_high_f64_f32 + #define vcvt_high_f64_f32(a) simde_vcvt_high_f64_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvtxd_f32_f64(simde_float64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtxd_f32_f64(a); + #else + return HEDLEY_STATIC_CAST(simde_float32_t, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtxd_f32_f64 + #define vcvtxd_f32_f64(a) simde_vcvtxd_f32_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcvtx_f32_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtx_f32_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_float32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtxd_f32_f64(a_.values[i]); + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtx_f32_f64 + #define vcvtx_f32_f64(a) simde_vcvtx_f32_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtx_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtx_high_f32_f64(r, a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_float32x4_private ret; + + size_t half_pos = (sizeof(ret.values) / sizeof(ret.values[0]) / 2); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < half_pos; i++) { + ret.values[i] = r_.values[i]; + } + SIMDE_VECTORIZE + for (size_t i = half_pos; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = 
simde_vcvtxd_f32_f64(a_.values[i-half_pos]); + } + + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtx_high_f32_f64 + #define vcvtx_high_f32_f64(r, a) simde_vcvtx_high_f32_f64((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcvt_bf16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvt_bf16_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_bfloat16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_from_float32(a_.values[i]); + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvt_bf16_f32 + #define vcvt_bf16_f32(a) simde_vcvt_bf16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvt_f32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvt_f32_bf16(a); + #else + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_to_float32(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvt_f32_bf16 + #define vcvt_f32_bf16(a) simde_vcvt_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvtah_f32_bf16(simde_bfloat16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtah_f32_bf16(a); + #else + return simde_bfloat16_to_float32(a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvtah_f32_bf16 + #define vcvtah_f32_bf16(a) simde_vcvtah_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vcvth_bf16_f32(float a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvth_bf16_f32(a); + #else + return simde_bfloat16_from_float32(a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvth_bf16_f32 + #define vcvth_bf16_f32(a) simde_vcvth_bf16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_low_f32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_low_f32_bf16(a); + #else + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_to_float32(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvtq_low_f32_bf16 + #define vcvtq_low_f32_bf16(a) simde_vcvtq_low_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_high_f32_bf16(simde_bfloat16x8_t a) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_high_f32_bf16(a); + #else + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_float32x4_private r_; + + size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_to_float32(a_.values[i + rsize]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvtq_high_f32_bf16 + #define vcvtq_high_f32_bf16(a) simde_vcvtq_high_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcvtq_low_bf16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_low_bf16_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_bfloat16x8_private r_; + + size_t asize = (sizeof(a_.values) / sizeof(a_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < asize; i++) { + r_.values[i] = simde_bfloat16_from_float32(a_.values[i]); + r_.values[i + asize] = SIMDE_BFLOAT16_VALUE(0.0); + } + + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvtq_low_bf16_f32 + #define vcvtq_low_bf16_f32(a) simde_vcvtq_low_bf16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcvtq_high_bf16_f32(simde_bfloat16x8_t inactive, simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_high_bf16_f32(inactive, a); + #else + simde_bfloat16x8_private inactive_ = simde_bfloat16x8_to_private(inactive); + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_bfloat16x8_private r_; + + size_t asize = (sizeof(a_.values) / sizeof(a_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r_.values[i] = inactive_.values[i]; + r_.values[i + asize] = simde_bfloat16_from_float32(a_.values[i]); + } + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vcvtq_high_bf16_f32 + #define vcvtq_high_bf16_f32(inactive, a) simde_vcvtq_high_bf16_f32((inactive), (a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/cvt_n.h b/arm/neon/cvt_n.h new file mode 100644 index 000000000..3574a3f6c --- /dev/null +++ b/arm/neon/cvt_n.h @@ -0,0 +1,703 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVT_N_H) +#define SIMDE_ARM_NEON_CVT_N_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvth_n_u16_f16(simde_float16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return simde_vcvth_u16_f16( + simde_float16_from_float32( + simde_float16_to_float32(a) * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvth_n_u16_f16(a, n) vcvth_n_u16_f16(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_n_u16_f16 + #define vcvth_n_u16_f16(a, n) simde_vcvth_n_u16_f16(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_n_f16_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return simde_float16_from_float32( + HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvth_n_f16_s16(a, n) vcvth_n_f16_s16(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_n_f16_s16 + #define vcvth_n_f16_s16(a, n) simde_vcvth_n_f16_s16(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_n_f16_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return simde_float16_from_float32( + HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvth_n_f16_u16(a, n) vcvth_n_f16_u16(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvth_n_f16_u16 + #define vcvth_n_f16_u16(a, n) simde_vcvth_n_f16_u16(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvts_n_s32_f32(simde_float32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return simde_vcvts_s32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_s32_f32(a, n) vcvts_n_s32_f32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_s32_f32 + #define vcvts_n_s32_f32(a, n) simde_vcvts_n_s32_f32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvts_n_u32_f32(simde_float32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return simde_vcvts_u32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_u32_f32(a, n) 
vcvts_n_u32_f32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_u32_f32 + #define vcvts_n_u32_f32(a, n) simde_vcvts_n_u32_f32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvts_n_f32_s32(int32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_f32_s32(a, n) vcvts_n_f32_s32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_f32_s32 + #define vcvts_n_f32_s32(a, n) simde_vcvts_n_f32_s32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvts_n_f32_u32(uint32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_f32_u32(a, n) vcvts_n_f32_u32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_f32_u32 + #define vcvts_n_f32_u32(a, n) simde_vcvts_n_f32_u32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtd_n_s64_f64(simde_float64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return simde_vcvtd_s64_f64(a * simde_math_pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_s64_f64(a, n) vcvtd_n_s64_f64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_s64_f64 + #define vcvtd_n_s64_f64(a, n) simde_vcvtd_n_s64_f64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtd_n_u64_f64(simde_float64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return simde_vcvtd_u64_f64(a * simde_math_pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_u64_f64(a, n) vcvtd_n_u64_f64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_u64_f64 + #define vcvtd_n_u64_f64(a, n) simde_vcvtd_n_u64_f64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vcvtd_n_f64_s64(int64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_f64_s64(a, n) vcvtd_n_f64_s64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_f64_s64 + #define vcvtd_n_f64_s64(a, n) simde_vcvtd_n_f64_s64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vcvtd_n_f64_u64(uint64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_f64_u64(a, n) vcvtd_n_f64_u64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_f64_u64 + #define vcvtd_n_f64_u64(a, n) simde_vcvtd_n_f64_u64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcvt_n_s32_f32(simde_float32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_int32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))); + } + + return simde_int32x2_from_private(r_); +} +#if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_s32_f32(a, n) vcvt_n_s32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_s32_f32 + #define vcvt_n_s32_f32(a, n) simde_vcvt_n_s32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcvt_n_s64_f64(simde_float64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_int64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * simde_math_pow(2, n)); + } + + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvt_n_s64_f64(a, n) vcvt_n_s64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_s64_f64 + #define vcvt_n_s64_f64(a, n) simde_vcvt_n_s64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvt_n_u16_f16(simde_float16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * + HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)))); + } + + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvt_n_u16_f16(a, n) vcvt_n_u16_f16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvt_n_u16_f16 + #define vcvt_n_u16_f16(a, n) simde_vcvt_n_u16_f16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvt_n_u32_f32(simde_float32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))); + } + + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_u32_f32(a, n) vcvt_n_u32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_u32_f32 + #define vcvt_n_u32_f32(a, n) simde_vcvt_n_u32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvt_n_u64_f64(simde_float64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * simde_math_pow(2, n)); + } + + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + #define simde_vcvt_n_u64_f64(a, n) vcvt_n_u64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvt_n_u64_f64 + #define vcvt_n_u64_f64(a, n) simde_vcvt_n_u64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_int32x4_t +simde_vcvtq_n_s32_f32(simde_float32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_int32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvtq_n_s32_f32(a, n) vcvtq_n_s32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_s32_f32 + #define vcvtq_n_s32_f32(a, n) simde_vcvtq_n_s32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcvtq_n_s64_f64(simde_float64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_int64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * simde_math_pow(2, n)); + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtq_n_s64_f64(a, n) vcvtq_n_s64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_s64_f64 + #define vcvtq_n_s64_f64(a, n) simde_vcvtq_n_s64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtq_n_u16_f16(simde_float16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * + HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)))); + } + + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) +#define simde_vcvtq_n_u16_f16(a, n) vcvtq_n_u16_f16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtq_n_u16_f16 + #define vcvtq_n_u16_f16(a, n) simde_vcvtq_n_u16_f16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtq_n_u32_f32(simde_float32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))); + } + + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + #define simde_vcvtq_n_u32_f32(a, n) vcvtq_n_u32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvtq_n_u32_f32 + #define vcvtq_n_u32_f32(a, n) simde_vcvtq_n_u32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtq_n_u64_f64(simde_float64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t 
i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * simde_math_pow(2, n)); + } + + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + #define simde_vcvtq_n_u64_f64(a, n) vcvtq_n_u64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvtq_n_u64_f64 + #define vcvtq_n_u64_f64(a, n) simde_vcvtq_n_u64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcvt_n_f16_u16(simde_uint16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_float16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n))); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvt_n_f16_u16(a, n) vcvt_n_f16_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvt_n_f16_u16 + #define vcvt_n_f16_u16(a, n) simde_vcvt_n_f16_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcvt_n_f16_s16(simde_int16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_float16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n))); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvt_n_f16_s16(a, n) vcvt_n_f16_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvt_n_f16_s16 + #define vcvt_n_f16_s16(a, n) simde_vcvt_n_f16_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcvtq_n_f16_u16(simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_float16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n))); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvtq_n_f16_u16(a, n) vcvtq_n_f16_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtq_n_f16_u16 + #define vcvtq_n_f16_u16(a, n) simde_vcvtq_n_f16_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcvtq_n_f16_s16(simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int16x8_private a_ = simde_int16x8_to_private(a); + 
simde_float16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, (a_.values[i] / simde_math_pow(2, n)))); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvtq_n_f16_s16(a, n) vcvtq_n_f16_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtq_n_f16_s16 + #define vcvtq_n_f16_s16(a, n) simde_vcvtq_n_f16_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcvt_n_f32_u32(simde_uint32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_float32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_f32_u32(a, n) vcvt_n_f32_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f32_u32 + #define vcvt_n_f32_u32(a, n) simde_vcvt_n_f32_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcvt_n_f32_s32(simde_int32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_float32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_f32_s32(a, n) vcvt_n_f32_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f32_s32 + #define vcvt_n_f32_s32(a, n) simde_vcvt_n_f32_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcvt_n_f64_u64(simde_uint64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_float64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvt_n_f64_u64(a, n) vcvt_n_f64_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f64_u64 + #define vcvt_n_f64_u64(a, n) simde_vcvt_n_f64_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcvtq_n_f64_u64(simde_uint64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_float64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float64x2_from_private(r_); +} +#if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtq_n_f64_u64(a, n) vcvtq_n_f64_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f64_u64 + #define vcvtq_n_f64_u64(a, n) simde_vcvtq_n_f64_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcvt_n_f64_s64(simde_int64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_float64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvt_n_f64_s64(a, n) vcvt_n_f64_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f64_s64 + #define vcvt_n_f64_s64(a, n) simde_vcvt_n_f64_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcvtq_n_f64_s64(simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_float64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtq_n_f64_s64(a, n) vcvtq_n_f64_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f64_s64 + #define vcvtq_n_f64_s64(a, n) simde_vcvtq_n_f64_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_n_f32_s32(simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvtq_n_f32_s32(a, n) vcvtq_n_f32_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f32_s32 + #define vcvtq_n_f32_s32(a, n) simde_vcvtq_n_f32_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_n_f32_u32(simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)); + } + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvtq_n_f32_u32(a, n) vcvtq_n_f32_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f32_u32 + #define vcvtq_n_f32_u32(a, n) simde_vcvtq_n_f32_u32((a), (n)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVT_N_H */ diff --git a/arm/neon/cvtm.h b/arm/neon/cvtm.h new file mode 100644 index 
000000000..bf0aed7bb --- /dev/null +++ b/arm/neon/cvtm.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVTM_H) +#define SIMDE_ARM_NEON_CVTM_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtmh_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtmh_s64_f16 + #define vcvtmh_s64_f16(a) simde_vcvtmh_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtmh_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtmh_s32_f16 + #define vcvtmh_s32_f16(a) simde_vcvtmh_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtmh_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return 
vcvtmh_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtmh_u64_f16 + #define vcvtmh_u64_f16(a) simde_vcvtmh_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtmh_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtmh_u32_f16 + #define vcvtmh_u32_f16(a) simde_vcvtmh_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtmh_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtmh_u16_f16 + #define vcvtmh_u16_f16(a) simde_vcvtmh_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtms_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtms_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtms_u32_f32 + #define vcvtms_u32_f32(a) simde_vcvtms_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtmd_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtmd_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return 
HEDLEY_STATIC_CAST(uint64_t, simde_math_floor(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_floor(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmd_u64_f64 + #define vcvtmd_u64_f64(a) simde_vcvtmd_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtmq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmh_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtmq_u16_f16 + #define vcvtmq_u16_f16(a) simde_vcvtmq_u16_f16(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtmq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtmq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epu32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epu32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtms_u32_f32(a_.values[i]); + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvtmq_u32_f32 + #define vcvtmq_u32_f32(a) simde_vcvtmq_u32_f32(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtmq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtmq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmd_u64_f64(a_.values[i]); + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmq_u64_f64 + #define vcvtmq_u64_f64(a) simde_vcvtmq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvtm_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtm_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmh_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtm_u16_f16 + #define vcvtm_u16_f16(a) simde_vcvtm_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvtm_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtm_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtms_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtm_u32_f32 + #define vcvtm_u32_f32(a) simde_vcvtm_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvtm_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtm_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmd_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtm_u64_f64 + #define vcvtm_u64_f64(a) simde_vcvtm_u64_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVTM_H */ diff --git a/arm/neon/cvtn.h b/arm/neon/cvtn.h new file mode 100644 index 000000000..1363989f8 --- /dev/null +++ b/arm/neon/cvtn.h @@ -0,0 +1,538 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVTN_H) +#define SIMDE_ARM_NEON_CVTN_H + +#include "types.h" +#include "cvt.h" +#include "calt.h" +#include "cagt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcvtnq_s32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnq_s32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_int32x4_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epi32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epi32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a_.values[i])); + } + #endif + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_s32_f32 + #define vcvtnq_s32_f32(a) simde_vcvtnq_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcvtnq_s64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnq_s64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_int64x2_private r_; + + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epi64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epi64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a_.values[i])); + } + #endif + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_s64_f64 + #define vcvtnq_s64_f64(a) simde_vcvtnq_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtnh_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, 
simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtnh_s64_f16 + #define vcvtnh_s64_f16(a) simde_vcvtnh_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtnh_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtnh_s32_f16 + #define vcvtnh_s32_f16(a) simde_vcvtnh_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtnh_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtnh_u64_f16 + #define vcvtnh_u64_f16(a) simde_vcvtnh_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtnh_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtnh_u32_f16 + #define vcvtnh_u32_f16(a) simde_vcvtnh_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtnh_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a_ 
> HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtnh_u16_f16 + #define vcvtnh_u16_f16(a) simde_vcvtnh_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtns_s32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtns_s32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a)); + #else + if (HEDLEY_UNLIKELY(a < HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtns_s32_f32 + #define vcvtns_s32_f32(a) simde_vcvtns_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtns_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtns_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a)); + #else + if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtns_u32_f32 + #define vcvtns_u32_f32(a) simde_vcvtns_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtnq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtnq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epu32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epu32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtns_u32_f32(a_.values[i]); + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvtnq_u32_f32 + #define vcvtnq_u32_f32(a) simde_vcvtnq_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtnd_s64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnd_s64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a)); + #else + if (HEDLEY_UNLIKELY(a < HEDLEY_STATIC_CAST(simde_float64, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) { + return INT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnd_s64_f64 + #define vcvtnd_s64_f64(a) simde_vcvtnd_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtnd_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnd_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundeven(a)); + #else + if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundeven(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnd_u64_f64 + #define vcvtnd_u64_f64(a) simde_vcvtnd_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtnq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnd_u64_f64(a_.values[i]); + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_u64_f64 + #define vcvtnq_u64_f64(a) simde_vcvtnq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtnq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnh_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtnq_u16_f16 + #define vcvtnq_u16_f16(a) simde_vcvtnq_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvtn_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtn_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnh_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtn_u16_f16 + #define vcvtn_u16_f16(a) simde_vcvtn_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvtn_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtns_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_u32_f32 + #define vcvtn_u32_f32(a) simde_vcvtn_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcvtn_s32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_s32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_int32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtns_s32_f32(a_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_s32_f32 + #define vcvtn_s32_f32(a) simde_vcvtn_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcvtn_s64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_s64_f64(a); + #else + 
simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_int64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnd_s64_f64(a_.values[i]); + } + + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_s64_f64 + #define vcvtn_s64_f64(a) simde_vcvtn_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvtn_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnd_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_u64_f64 + #define vcvtn_u64_f64(a) simde_vcvtn_u64_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVTN_H */ diff --git a/arm/neon/cvtp.h b/arm/neon/cvtp.h new file mode 100644 index 000000000..5c29f5dbc --- /dev/null +++ b/arm/neon/cvtp.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVTP_H) +#define SIMDE_ARM_NEON_CVTP_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtph_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtph_s64_f16 + #define vcvtph_s64_f16(a) simde_vcvtph_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtph_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtph_s32_f16 + #define vcvtph_s32_f16(a) simde_vcvtph_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtph_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtph_u64_f16 + #define vcvtph_u64_f16(a) simde_vcvtph_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtph_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if 
(HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtph_u32_f16 + #define vcvtph_u32_f16(a) simde_vcvtph_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtph_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtph_u16_f16 + #define vcvtph_u16_f16(a) simde_vcvtph_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtps_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtps_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtps_u32_f32 + #define vcvtps_u32_f32(a) simde_vcvtps_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtpd_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtpd_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceil(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceil(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtpd_u64_f64 + #define vcvtpd_u64_f64(a) simde_vcvtpd_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtpq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtpq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtph_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtpq_u16_f16 + #define vcvtpq_u16_f16(a) simde_vcvtpq_u16_f16(a) +#endif + 
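/* A minimal usage sketch of the round-toward-plus-infinity conversions above,
 * assuming only simde_vcvtph_u16_f16 and simde_vcvtps_u32_f32 from this file
 * plus simde_float16_from_float32 from simde-f16.h; the helper name
 * example_ceil_f16_to_u16, the include path, and the sample values are
 * illustrative assumptions, not part of the API. The saturation and NaN
 * behaviour shown applies when SIMDE_FAST_CONVERSION_RANGE is not defined.
 *
 *   #include <simde/arm/neon.h>
 *
 *   // Narrow a float32 to float16, then convert with ceiling rounding and
 *   // saturation: 1.25f -> 2, -3.5f -> 0, NaN -> 0; out-of-range inputs
 *   // clamp to 0 or UINT16_MAX.
 *   static uint16_t example_ceil_f16_to_u16(simde_float32 x) {
 *     simde_float16_t h = simde_float16_from_float32(x);
 *     return simde_vcvtph_u16_f16(h);
 *   }
 *
 *   // The float32 scalar variant behaves the same way at 32-bit width:
 *   //   simde_vcvtps_u32_f32(SIMDE_FLOAT32_C(2.1))  == 3
 *   //   simde_vcvtps_u32_f32(SIMDE_FLOAT32_C(-0.5)) == 0
 */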
+SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtpq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtpq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epu32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epu32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtps_u32_f32(a_.values[i]); + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) + #undef vcvtpq_u32_f32 + #define vcvtpq_u32_f32(a) simde_vcvtpq_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtpq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtpq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtpd_u64_f64(a_.values[i]); + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtpq_u64_f64 + #define vcvtpq_u64_f64(a) simde_vcvtpq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvtp_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtp_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtph_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vcvtp_u16_f16 + #define vcvtp_u16_f16(a) simde_vcvtp_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvtp_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtp_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtps_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtp_u32_f32 + #define 
vcvtp_u32_f32(a) simde_vcvtp_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvtp_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtp_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtpd_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtp_u64_f64 + #define vcvtp_u64_f64(a) simde_vcvtp_u64_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVTP_H */ diff --git a/arm/neon/div.h b/arm/neon/div.h new file mode 100644 index 000000000..fed252dbf --- /dev/null +++ b/arm/neon/div.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_DIV_H) +#define SIMDE_ARM_NEON_DIV_H + +#include "types.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vdivh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vdivh_f16(a, b); + #else + return simde_float16_from_float32(simde_float16_to_float32(a) / simde_float16_to_float32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdivh_f16 + #define vdivh_f16(a, b) simde_vdivh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vdiv_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vdiv_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vdivh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdiv_f16 + #define vdiv_f16(a, b) simde_vdiv_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vdivq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vdivq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vdivh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdivq_f16 + #define vdivq_f16(a, b) simde_vdivq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vdiv_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdiv_f32(a, b); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdiv_f32 + #define vdiv_f32(a, b) simde_vdiv_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vdivq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdivq_f32(a, b); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdivq_f32 + 
#define vdivq_f32(a, b) simde_vdivq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vdiv_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdiv_f64(a, b); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdiv_f64 + #define vdiv_f64(a, b) simde_vdiv_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vdivq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdivq_f64(a, b); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdivq_f64 + #define vdivq_f64(a, b) simde_vdivq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_DIV_H) */ diff --git a/arm/neon/dot.h b/arm/neon/dot.h index fa7febe03..6ebe7d6da 100644 --- a/arm/neon/dot.h +++ b/arm/neon/dot.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_DOT_H) @@ -46,7 +47,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_int32x2_t simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdot_s32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vadd_s32(r, simde_vmovn_s64(simde_vpaddlq_s32(simde_vpaddlq_s16(simde_vmull_s8(a, b))))); @@ -55,19 +56,36 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { simde_int8x8_private a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - for (int i = 0 ; i < 2 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); - } - r_.values[i] = acc; - } - return simde_vadd_s32(r, simde_int32x2_from_private(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_tmp = simde_int32x2_to_private(r); + vint16m2_t vd_low = __riscv_vwmul_vv_i16m2 (a_.sv64, b_.sv64, 8); + vint16m2_t vd_high = __riscv_vslidedown_vx_i16m2(vd_low, 4, 8); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t vd_low_wide = __riscv_vwcvt_x_x_v_i32m1 (__riscv_vlmul_trunc_v_i16m2_i16mf2(vd_low), 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(vd_low_wide, vd, 4); + vint32m1_t vd_high_wide = __riscv_vwcvt_x_x_v_i32m1 (__riscv_vlmul_trunc_v_i16m2_i16mf2(vd_high), 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(vd_high_wide, vd, 4); + r_.sv64 = __riscv_vslideup_vx_i32m1( + __riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), + 1, 2); + return simde_int32x2_from_private(r_); + 
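/* The RISC-V Vector path above computes the two 4-way dot products without a
 * scalar loop: __riscv_vwmul_vv_i16m2 widens the eight int8 products to int16,
 * __riscv_vslidedown_vx_i16m2 moves products 4..7 down to the front,
 * __riscv_vwcvt_x_x_v_i32m1 plus __riscv_vredsum_vs_i32m1_i32m1 widen and
 * reduce each 4-element group to a single int32, __riscv_vadd_vx_i32m1 folds
 * in the corresponding accumulator lane of r, and __riscv_vslideup_vx_i32m1
 * packs the two sums into lanes 0 and 1, so this branch returns directly
 * instead of falling through to the final simde_vadd_s32. */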
#else + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + #endif + return simde_vadd_s32(r, simde_int32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_s32 #define vdot_s32(r, a, b) simde_vdot_s32((r), (a), (b)) #endif @@ -75,7 +93,7 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdot_u32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vadd_u32(r, simde_vmovn_u64(simde_vpaddlq_u32(simde_vpaddlq_u16(simde_vmull_u8(a, b))))); @@ -85,19 +103,36 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - for (int i = 0 ; i < 2 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); - } - r_.values[i] = acc; - } + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_tmp = simde_uint32x2_to_private(r); + vuint16m2_t vd_low = __riscv_vwmulu_vv_u16m2 (a_.sv64, b_.sv64, 8); + vuint16m2_t vd_high = __riscv_vslidedown_vx_u16m2(vd_low, 4, 8); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t vd_low_wide = __riscv_vwcvtu_x_x_v_u32m1 (__riscv_vlmul_trunc_v_u16m2_u16mf2(vd_low), 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(vd_low_wide, vd, 4); + vuint32m1_t vd_high_wide = __riscv_vwcvtu_x_x_v_u32m1 (__riscv_vlmul_trunc_v_u16m2_u16mf2(vd_high), 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(vd_high_wide, vd, 4); + r_.sv64 = __riscv_vslideup_vx_u32m1( + __riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), + 1, 2); + return simde_uint32x2_from_private(r_); + #else + for (int i = 0 ; i < 2 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + #endif return simde_vadd_u32(r, simde_uint32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_u32 #define vdot_u32(r, a, b) simde_vdot_u32((r), (a), (b)) #endif @@ -105,7 +140,7 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdotq_s32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vaddq_s32(r, @@ -116,19 +151,38 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { simde_int8x16_private a_ = simde_int8x16_to_private(a), b_ = simde_int8x16_to_private(b); - for (int i = 0 ; i < 4 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); - } - r_.values[i] = acc; - } + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_tmp = simde_int32x4_to_private(r); + vint16m2_t vd_low = __riscv_vwmul_vv_i16m2 (a_.sv128, b_.sv128, 16); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + vd_low), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + __riscv_vslidedown_vx_i16m2(vd_low, 4, 4)), 4), vd, 4); + vint32m1_t rst2 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + __riscv_vslidedown_vx_i16m2(vd_low, 8, 4)), 4), vd, 4); + vint32m1_t rst3 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + __riscv_vslidedown_vx_i16m2(vd_low, 12, 4)), 4), vd, 4); + vint32m1_t r0 = __riscv_vslideup_vx_i32m1(__riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), 1, 2); + vint32m1_t r1 = __riscv_vslideup_vx_i32m1(r0, __riscv_vadd_vx_i32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_i32m1(r1, __riscv_vadd_vx_i32m1(rst3, r_tmp.values[3], 2), 3, 4); + return simde_int32x4_from_private(r_); + #else + for (int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + #endif return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_s32 #define vdotq_s32(r, a, b) simde_vdotq_s32((r), (a), (b)) #endif @@ -136,7 +190,7 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdotq_u32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vaddq_u32(r, @@ -147,23 +201,98 @@ simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b); - for (int i = 0 ; i < 4 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * 
HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_tmp = simde_uint32x4_to_private(r); + vuint16m2_t vd_low = __riscv_vwmulu_vv_u16m2 (a_.sv128, b_.sv128, 16); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + vd_low), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + __riscv_vslidedown_vx_u16m2(vd_low, 4, 4)), 4), vd, 4); + vuint32m1_t rst2 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + __riscv_vslidedown_vx_u16m2(vd_low, 8, 4)), 4), vd, 4); + vuint32m1_t rst3 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + __riscv_vslidedown_vx_u16m2(vd_low, 12, 4)), 4), vd, 4); + vuint32m1_t r0 = __riscv_vslideup_vx_u32m1(__riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), 1, 2); + vuint32m1_t r1 = __riscv_vslideup_vx_u32m1(r0, __riscv_vadd_vx_u32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_u32m1(r1, __riscv_vadd_vx_u32m1(rst3, r_tmp.values[3], 2), 3, 4); + return simde_uint32x4_from_private(r_); + #else + for (int i = 0 ; i < 4 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); + } + r_.values[i] = acc; } - r_.values[i] = acc; - } + #endif return simde_vaddq_u32(r, simde_uint32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_u32 #define vdotq_u32(r, a, b) simde_vdotq_u32((r), (a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vbfdot_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + defined(SIMDE_ARM_NEON_BF16) + return vbfdot_f32(r, a, b); + #else + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_bfloat16x4_private + a_ = simde_bfloat16x4_to_private(a), + b_ = simde_bfloat16x4_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * i + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * i + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfdot_f32 + #define vbfdot_f32(r, a, b) simde_vbfdot_f32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfdotq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + return vbfdotq_f32(r, a, b); + #else + simde_float32x4_private r_ = 
simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * i + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * i + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) + #undef vbfdotq_f32 + #define vbfdotq_f32(r, a, b) simde_vbfdotq_f32((r), (a), (b)) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/dot_lane.h b/arm/neon/dot_lane.h index 84f706948..0cc312b35 100644 --- a/arm/neon/dot_lane.h +++ b/arm/neon/dot_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_DOT_LANE_H) @@ -45,7 +46,7 @@ simde_int32x2_t simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_int32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdot_lane_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x2_t @@ -69,24 +70,38 @@ simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const simde_int8x8_private a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_tmp = simde_int32x2_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(b_.sv64, lane*4, 4)); + vint16mf2_t vd_high = __riscv_vwmul_vv_i16mf2(__riscv_vlmul_trunc_v_i8m1_i8mf4 \ + (__riscv_vslidedown_vx_i8m1(a_.sv64, 4, 4)), vb_low, 4); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2( \ + __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 (vd_high, 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_i32m1( + __riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x2_from_private(r_); #endif return 
result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_lane_s32 #define vdot_lane_s32(r, a, b, lane) simde_vdot_lane_s32((r), (a), (b), (lane)) #endif @@ -96,7 +111,7 @@ simde_uint32x2_t simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_uint32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdot_lane_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x2_t @@ -120,24 +135,38 @@ simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, co simde_uint8x8_private a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_tmp = simde_uint32x2_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4 ( + __riscv_vslidedown_vx_u8m1(b_.sv64, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv64, 4, 4)), vb_low, 4), 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_u32m1( + __riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x2_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_lane_u32 #define vdot_lane_u32(r, a, b, lane) simde_vdot_lane_u32((r), (a), (b), (lane)) #endif @@ -147,7 +176,7 @@ simde_int32x2_t simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_int32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdot_laneq_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x2_t b_lane; @@ 
-169,24 +198,38 @@ simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, con simde_int32x2_private r_ = simde_int32x2_to_private(r); simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_tmp = simde_int32x2_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4( + __riscv_vslidedown_vx_i8m1(b_.sv128, lane*4, 4)); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 (__riscv_vwmul_vv_i16mf2 ( \ + __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 (__riscv_vwmul_vv_i16mf2 ( \ + __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv64, 4, 4)), vb_low, 4), 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_i32m1( + __riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x2_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_laneq_s32 #define vdot_laneq_s32(r, a, b, lane) simde_vdot_laneq_s32((r), (a), (b), (lane)) #endif @@ -196,7 +239,7 @@ simde_uint32x2_t simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_uint32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdot_laneq_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x2_t b_lane; @@ -218,23 +261,37 @@ simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, simde_uint32x2_private r_ = simde_uint32x2_to_private(r); simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_uint8x16_private b_ = simde_uint8x16_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_tmp = simde_uint32x2_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4 ( + __riscv_vslidedown_vx_u8m1(b_.sv128, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); 
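+      /* vd is the zero-valued seed for the reductions below: each rstN forms a widening
+       * 4-element dot product of one half of a with the lane-selected group vb_low
+       * (vwmulu -> vwcvtu -> vredsum), the matching accumulator lane of r is then added,
+       * and the two sums are packed into lanes 0 and 1 with vslideup. */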
+ vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4 (__riscv_vslidedown_vx_u8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4 (__riscv_vslidedown_vx_u8m1(a_.sv64, 4, 4)), vb_low, 4), 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_u32m1( + __riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x2_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_laneq_u32 #define vdot_laneq_u32(r, a, b, lane) simde_vdot_laneq_u32((r), (a), (b), (lane)) #endif @@ -244,7 +301,7 @@ simde_uint32x4_t simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_uint32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdotq_laneq_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x4_t @@ -280,23 +337,40 @@ simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b simde_uint8x16_private a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_tmp = simde_uint32x4_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4( + __riscv_vslidedown_vx_u8m1(b_.sv128, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 4, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst2 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 8, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst3 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 12, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t r0 = 
__riscv_vslideup_vx_u32m1(__riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), 1, 2); + vuint32m1_t r1 = __riscv_vslideup_vx_u32m1(r0, __riscv_vadd_vx_u32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_u32m1(r1, __riscv_vadd_vx_u32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x4_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_laneq_u32 #define vdotq_laneq_u32(r, a, b, lane) simde_vdotq_laneq_u32((r), (a), (b), (lane)) #endif @@ -306,7 +380,7 @@ simde_int32x4_t simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_int32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdotq_laneq_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x4_t @@ -342,23 +416,44 @@ simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, c simde_int8x16_private a_ = simde_int8x16_to_private(a), b_ = simde_int8x16_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_tmp = simde_int32x4_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4( + __riscv_vslidedown_vx_i8m1(b_.sv128, lane*4, 4)); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 0, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 4, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t rst2 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 8, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t rst3 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 12, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t r0 = __riscv_vslideup_vx_i32m1(__riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), 1, 2); + vint32m1_t r1 = __riscv_vslideup_vx_i32m1(r0, __riscv_vadd_vx_i32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = 
__riscv_vslideup_vx_i32m1(r1, __riscv_vadd_vx_i32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x4_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_laneq_s32 #define vdotq_laneq_s32(r, a, b, lane) simde_vdotq_laneq_s32((r), (a), (b), (lane)) #endif @@ -368,7 +463,7 @@ simde_uint32x4_t simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_uint32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdotq_lane_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x2_t @@ -403,23 +498,39 @@ simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, simde_uint32x4_private r_ = simde_uint32x4_to_private(r); simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_uint8x8_private b_ = simde_uint8x8_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_tmp = simde_uint32x4_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(b_.sv64, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 4, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst2 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 8, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst3 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 12, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t r0 = __riscv_vslideup_vx_u32m1(__riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), 1, 2); + vuint32m1_t r1 = __riscv_vslideup_vx_u32m1(r0, __riscv_vadd_vx_u32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_u32m1(r1, __riscv_vadd_vx_u32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; 
j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x4_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_lane_u32 #define vdotq_lane_u32(r, a, b, lane) simde_vdotq_lane_u32((r), (a), (b), (lane)) #endif @@ -429,7 +540,7 @@ simde_int32x4_t simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_int32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdotq_lane_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x2_t @@ -464,27 +575,178 @@ simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, con simde_int32x4_private r_ = simde_int32x4_to_private(r); simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_tmp = simde_int32x4_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4( + __riscv_vslidedown_vx_i8m1(b_.sv64, lane*4, 4)); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 0, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 4, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst2 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 8, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst3 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 12, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t r0 = __riscv_vslideup_vx_i32m1(__riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), 1, 2); + vint32m1_t r1 = __riscv_vslideup_vx_i32m1(r0, __riscv_vadd_vx_i32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_i32m1(r1, __riscv_vadd_vx_i32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - 
r_.values[i] += acc; - } - + #endif result = simde_int32x4_from_private(r_); #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_lane_s32 #define vdotq_lane_s32(r, a, b, lane) simde_vdotq_lane_s32((r), (a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vbfdot_lane_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_2_(vbfdot_lane_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_bfloat16x4_private + a_ = simde_bfloat16x4_to_private(a), + b_ = simde_bfloat16x4_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x2_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) + #undef vbfdot_lane_f32 + #define vbfdot_lane_f32(r, a, b, lane) simde_vbfdot_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfdotq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x4_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_2_(vbfdotq_lane_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x4_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) + #undef vbfdotq_lane_f32 + #define vbfdotq_lane_f32(r, a, b, lane) simde_vbfdotq_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vbfdot_laneq_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x8_t b, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vbfdot_laneq_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_bfloat16x8_private b_ = simde_bfloat16x8_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x2_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) + #undef vbfdot_laneq_f32 + #define vbfdot_laneq_f32(r, a, b, lane) simde_vbfdot_laneq_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfdotq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vbfdotq_laneq_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x4_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) + #undef vbfdotq_laneq_f32 + #define vbfdotq_laneq_f32(r, a, b, lane) simde_vbfdotq_laneq_f32((r), (a), (b), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/dup_lane.h b/arm/neon/dup_lane.h index bc1720518..4d013b109 100644 --- a/arm/neon/dup_lane.h +++ b/arm/neon/dup_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_DUP_LANE_H) @@ -146,6 +147,63 @@ simde_vdupd_lane_u64(simde_uint64x1_t vec, const int lane) #define vdupd_lane_u64(vec, lane) simde_vdupd_lane_u64((vec), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vduph_lane_f16(simde_float16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_float16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + 
#define simde_vduph_lane_f16(vec, lane) vduph_lane_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vduph_lane_f16 + #define vduph_lane_f16(vec, lane) simde_vduph_lane_f16((vec), (lane)) +#endif + +// simde_vdup_lane_f16 +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdup_lane_f16(vec, lane) vdup_lane_f16(vec, lane) +#else + #define simde_vdup_lane_f16(vec, lane) simde_vdup_n_f16(simde_vduph_lane_f16(vec, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdup_lane_f16 + #define vdup_lane_f16(vec, lane) simde_vdup_lane_f16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vdup_laneq_f16(simde_float16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_f16(simde_float16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdup_laneq_f16(vec, lane) vdup_laneq_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdup_laneq_f16 + #define vdup_laneq_f16(vec, lane) simde_vdup_laneq_f16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vdupq_lane_f16(simde_float16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdupq_n_f16(simde_float16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdupq_lane_f16(vec, lane) vdupq_lane_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdupq_lane_f16 + #define vdupq_lane_f16(vec, lane) simde_vdupq_lane_f16((vec), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float64_t simde_vdupd_lane_f64(simde_float64x1_t vec, const int lane) @@ -924,6 +982,21 @@ simde_vdupq_lane_u64(simde_uint64x1_t vec, const int lane) #define vdupq_lane_u64(vec, lane) simde_vdupq_lane_u64((vec), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vdupq_laneq_f16(simde_float16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_f16(simde_float16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdupq_laneq_f16(vec, lane) vdupq_laneq_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdupq_laneq_f16 + #define vdupq_laneq_f16(vec, lane) simde_vdupq_laneq_f16((vec), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vdupq_laneq_f32(simde_float32x4_t vec, const int lane) @@ -1194,6 +1267,448 @@ simde_vdupq_laneq_u64(simde_uint64x2_t vec, const int lane) #define vdupq_laneq_u64(vec, lane) simde_vdupq_laneq_u64((vec), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vdupb_lane_s8(simde_int8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_int8x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_lane_s8(vec, lane) vdupb_lane_s8(vec, lane) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_lane_s8 + #define vdupb_lane_s8(vec, lane) simde_vdupb_lane_s8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vdupb_lane_u8(simde_uint8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_uint8x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_lane_u8(vec, lane) vdupb_lane_u8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_lane_u8 + #define vdupb_lane_u8(vec, lane) simde_vdupb_lane_u8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vdupb_laneq_s8(simde_int8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_int8x16_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_laneq_s8(vec, lane) vdupb_laneq_s8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_laneq_s8 + #define vdupb_laneq_s8(vec, lane) simde_vdupb_laneq_s8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vdupb_laneq_u8(simde_uint8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_uint8x16_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_laneq_u8(vec, lane) vdupb_laneq_u8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_laneq_u8 + #define vdupb_laneq_u8(vec, lane) simde_vdupb_laneq_u8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vduph_lane_s16(simde_int16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_int16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_lane_s16(vec, lane) vduph_lane_s16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_s16 + #define vduph_lane_s16(vec, lane) simde_vduph_lane_s16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vduph_lane_u16(simde_uint16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_uint16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_lane_u16(vec, lane) vduph_lane_u16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_u16 + #define vduph_lane_u16(vec, lane) simde_vduph_lane_u16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vduph_laneq_s16(simde_int16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_int16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_laneq_s16(vec, lane) vduph_laneq_s16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_s16 + #define vduph_laneq_s16(vec, lane) simde_vduph_laneq_s16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vduph_laneq_u16(simde_uint16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_uint16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_laneq_u16(vec, lane) vduph_laneq_u16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_u16 + #define vduph_laneq_u16(vec, lane) simde_vduph_laneq_u16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t 
+simde_vduph_laneq_f16(simde_float16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_float16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vduph_laneq_f16(vec, lane) vduph_laneq_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vduph_laneq_f16 + #define vduph_laneq_f16(vec, lane) simde_vduph_laneq_f16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vdup_lane_p8(simde_poly8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_p8(simde_poly8x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdup_lane_p8(vec, lane) vdup_lane_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_p8 + #define vdup_lane_p8(vec, lane) simde_vdup_lane_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vdup_lane_p16(simde_poly16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdup_n_p16(simde_poly16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdup_lane_p16(vec, lane) vdup_lane_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_p16 + #define vdup_lane_p16(vec, lane) simde_vdup_lane_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vdup_lane_p64(simde_poly64x1_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + return simde_vdup_n_p64(simde_poly64x1_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vdup_lane_p64(vec, lane) vdup_lane_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_p64 + #define vdup_lane_p64(vec, lane) simde_vdup_lane_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vdup_laneq_p8(simde_poly8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_vdup_n_p8(simde_poly8x16_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdup_laneq_p8(vec, lane) vdup_laneq_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_p8 + #define vdup_laneq_p8(vec, lane) simde_vdup_laneq_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vdup_laneq_p16(simde_poly16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_p16(simde_poly16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdup_laneq_p16(vec, lane) vdup_laneq_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_p16 + #define vdup_laneq_p16(vec, lane) simde_vdup_laneq_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vdup_laneq_p64(simde_poly64x2_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vdup_n_p64(simde_poly64x2_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdup_laneq_p64(vec, lane) vdup_laneq_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_p64 + #define vdup_laneq_p64(vec, lane) 
simde_vdup_laneq_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vdupq_lane_p8(simde_poly8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_p8(simde_poly8x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdupq_lane_p8(vec, lane) vdupq_lane_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_p8 + #define vdupq_lane_p8(vec, lane) simde_vdupq_lane_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vdupq_lane_p16(simde_poly16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdupq_n_p16(simde_poly16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdupq_lane_p16(vec, lane) vdupq_lane_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_p16 + #define vdupq_lane_p16(vec, lane) simde_vdupq_lane_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vdupq_lane_p64(simde_poly64x1_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + return simde_vdupq_n_p64(simde_poly64x1_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vdupq_lane_p64(vec, lane) vdupq_lane_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_p64 + #define vdupq_lane_p64(vec, lane) simde_vdupq_lane_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vdupq_laneq_p8(simde_poly8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_vdupq_n_p8(simde_poly8x16_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupq_laneq_p8(vec, lane) vdupq_laneq_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_p8 + #define vdupq_laneq_p8(vec, lane) simde_vdupq_laneq_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vdupq_laneq_p16(simde_poly16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_p16(simde_poly16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupq_laneq_p16(vec, lane) vdupq_laneq_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_p16 + #define vdupq_laneq_p16(vec, lane) simde_vdupq_laneq_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vdupq_laneq_p64(simde_poly64x2_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vdupq_n_p64(simde_poly64x2_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupq_laneq_p64(vec, lane) vdupq_laneq_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_p64 + #define vdupq_laneq_p64(vec, lane) simde_vdupq_laneq_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vdupb_lane_p8(simde_poly8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_poly8x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vdupb_lane_p8(vec, lane) vdupb_lane_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vdupb_lane_p8 + #define vdupb_lane_p8(vec, lane) simde_vdupb_lane_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vdupb_laneq_p8(simde_poly8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_poly8x16_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vdupb_laneq_p8(vec, lane) vdupb_laneq_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vdupb_laneq_p8 + #define vdupb_laneq_p8(vec, lane) simde_vdupb_laneq_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vduph_lane_p16(simde_poly16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_poly16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vduph_lane_p16(vec, lane) vduph_lane_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vduph_lane_p16 + #define vduph_lane_p16(vec, lane) simde_vduph_lane_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vduph_laneq_p16(simde_poly16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_poly16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vduph_laneq_p16(vec, lane) vduph_laneq_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vduph_laneq_p16 + #define vduph_laneq_p16(vec, lane) simde_vduph_laneq_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vduph_lane_bf16(simde_bfloat16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_bfloat16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vduph_lane_bf16(vec, lane) vduph_lane_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vduph_lane_bf16 + #define vduph_lane_bf16(vec, lane) simde_vduph_lane_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vduph_laneq_bf16(simde_bfloat16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_bfloat16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vduph_laneq_bf16(vec, lane) vduph_laneq_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vduph_laneq_bf16 + #define vduph_laneq_bf16(vec, lane) simde_vduph_laneq_bf16((vec), (lane)) +#endif + +// simde_vdup_lane_bf16 +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vdup_lane_bf16(vec, lane) vdup_lane_bf16(vec, lane) +#else + #define simde_vdup_lane_bf16(vec, lane) simde_vdup_n_bf16(simde_vduph_lane_bf16(vec, lane)) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vdup_lane_bf16 + #define vdup_lane_bf16(vec, lane) simde_vdup_lane_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vdup_laneq_bf16(simde_bfloat16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_bf16(simde_bfloat16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) +#define simde_vdup_laneq_bf16(vec, lane) vdup_laneq_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vdup_laneq_bf16 + #define vdup_laneq_bf16(vec, lane) simde_vdup_laneq_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vdupq_lane_bf16(simde_bfloat16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdupq_n_bf16(simde_bfloat16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) +#define simde_vdupq_lane_bf16(vec, lane) vdupq_lane_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vdupq_lane_bf16 + #define vdupq_lane_bf16(vec, lane) simde_vdupq_lane_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vdupq_laneq_bf16(simde_bfloat16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_bf16(simde_bfloat16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vdupq_laneq_bf16(vec, lane) vdupq_laneq_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vdupq_laneq_bf16 + #define vdupq_laneq_bf16(vec, lane) simde_vdupq_laneq_bf16((vec), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/dup_n.h b/arm/neon/dup_n.h index e945e99c9..663667ce8 100644 --- a/arm/neon/dup_n.h +++ b/arm/neon/dup_n.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Sean Maher (Copyright owned by Google, LLC) * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_DUP_N_H) @@ -36,22 +38,25 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t -simde_vdup_n_f16(simde_float16 value) { +simde_vdup_n_f16(simde_float16_t value) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vdup_n_f16(value); #else simde_float16x4_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfmv_v_f_f16m1 (value, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float16x4_from_private(r_); #endif } #define simde_vmov_n_f16 simde_vdup_n_f16 -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdup_n_f16 #define 
vdup_n_f16(value) simde_vdup_n_f16((value)) #undef vmov_n_f16 @@ -65,12 +70,14 @@ simde_vdup_n_f32(float value) { return vdup_n_f32(value); #else simde_float32x2_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmv_v_f_f32m1(value, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float32x2_from_private(r_); #endif } @@ -89,12 +96,14 @@ simde_vdup_n_f64(double value) { return vdup_n_f64(value); #else simde_float64x1_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmv_v_f_f64m1(value, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float64x1_from_private(r_); #endif } @@ -116,11 +125,13 @@ simde_vdup_n_s8(int8_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi8(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i8m1(value, 8); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } #endif return simde_int8x8_from_private(r_); @@ -144,6 +155,8 @@ simde_vdup_n_s16(int16_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi16(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i16m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -172,6 +185,8 @@ simde_vdup_n_s32(int32_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi32(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i32m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -198,11 +213,14 @@ simde_vdup_n_s64(int64_t value) { #else simde_int64x1_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i64m1(value, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_int64x1_from_private(r_); #endif } @@ -224,6 +242,8 @@ simde_vdup_n_u8(uint8_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi8(HEDLEY_STATIC_CAST(int8_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u8m1(value, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -252,6 +272,8 @@ simde_vdup_n_u16(uint16_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi16(HEDLEY_STATIC_CAST(int16_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u16m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -280,6 +302,8 @@ simde_vdup_n_u32(uint32_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi32(HEDLEY_STATIC_CAST(int32_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u32m1(value, 
2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -305,12 +329,14 @@ simde_vdup_n_u64(uint64_t value) { return vdup_n_u64(value); #else simde_uint64x1_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u64m1(value, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_uint64x1_from_private(r_); #endif } @@ -324,22 +350,25 @@ simde_vdup_n_u64(uint64_t value) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t -simde_vdupq_n_f16(simde_float16 value) { +simde_vdupq_n_f16(simde_float16_t value) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vdupq_n_f16(value); #else simde_float16x8_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfmv_v_f_f16m1(value, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float16x8_from_private(r_); #endif } -#define simde_vmovq_n_f32 simde_vdupq_n_f32 -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#define simde_vmovq_n_f16 simde_vdupq_n_f16 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdupq_n_f16 #define vdupq_n_f16(value) simde_vdupq_n_f16((value)) #undef vmovq_n_f16 @@ -361,6 +390,8 @@ simde_vdupq_n_f32(float value) { r_.m128 = _mm_set1_ps(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmv_v_f_f32m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -394,6 +425,8 @@ simde_vdupq_n_f64(double value) { r_.m128d = _mm_set1_pd(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmv_v_f_f64m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -426,6 +459,8 @@ simde_vdupq_n_s8(int8_t value) { r_.m128i = _mm_set1_epi8(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i8m1(value, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -458,6 +493,8 @@ simde_vdupq_n_s16(int16_t value) { r_.m128i = _mm_set1_epi16(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i16m1(value, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -490,6 +527,8 @@ simde_vdupq_n_s32(int32_t value) { r_.m128i = _mm_set1_epi32(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i32m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -522,6 +561,8 @@ simde_vdupq_n_s64(int64_t value) { r_.m128i = 
_mm_set1_epi64x(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i64m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -554,6 +595,8 @@ simde_vdupq_n_u8(uint8_t value) { r_.m128i = _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_splat(HEDLEY_STATIC_CAST(int8_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u8m1(value, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -586,6 +629,8 @@ simde_vdupq_n_u16(uint16_t value) { r_.m128i = _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_splat(HEDLEY_STATIC_CAST(int16_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u16m1(value, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -618,6 +663,8 @@ simde_vdupq_n_u32(uint32_t value) { r_.m128i = _mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_splat(HEDLEY_STATIC_CAST(int32_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u32m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -650,6 +697,8 @@ simde_vdupq_n_u64(uint64_t value) { r_.m128i = _mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_splat(HEDLEY_STATIC_CAST(int64_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u64m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -668,6 +717,188 @@ simde_vdupq_n_u64(uint64_t value) { #define vmovq_n_u64(value) simde_vmovq_n_u64((value)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vdup_n_p8(simde_poly8_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdup_n_p8(value); + #else + simde_poly8x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#define simde_vmov_n_p8 simde_vdup_n_p8 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_n_p8 + #define vdup_n_p8(value) simde_vdup_n_p8((value)) + #undef vmov_n_p8 + #define vmov_n_p8(value) simde_vmov_n_p8((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vdup_n_p16(simde_poly16_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdup_n_p16(value); + #else + simde_poly16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#define simde_vmov_n_p16 simde_vdup_n_p16 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_n_p16 + #define vdup_n_p16(value) simde_vdup_n_p16((value)) + #undef vmov_n_p16 + #define vmov_n_p16(value) simde_vmov_n_p16((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vdup_n_p64(simde_poly64_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vdup_n_p64(value); + #else + simde_poly64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_n_p64 + #define vdup_n_p64(value) simde_vdup_n_p64((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vdupq_n_p8(simde_poly8_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdupq_n_p8(value); + #else + simde_poly8x16_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#define simde_vmovq_n_p8 simde_vdupq_n_p8 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_p8 + #define vdupq_n_p8(value) simde_vdupq_n_p8((value)) + #undef vmovq_n_p8 + #define vmovq_n_p8(value) simde_vmovq_n_p8((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vdupq_n_p16(simde_poly16_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdupq_n_p16(value); + #else + simde_poly16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#define simde_vmovq_n_p16 simde_vdupq_n_p16 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_p16 + #define vdupq_n_p16(value) simde_vdupq_n_p16((value)) + #undef vmovq_n_p16 + #define vmovq_n_p16(value) simde_vmovq_n_p16((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vdupq_n_p64(simde_poly64_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vdupq_n_p64(value); + #else + simde_poly64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_p64 + #define vdupq_n_p64(value) simde_vdupq_n_p64((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vdup_n_bf16(simde_bfloat16_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vdup_n_bf16(value); + #else + simde_bfloat16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdup_n_bf16 + #define vdup_n_bf16(value) simde_vdup_n_bf16((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vdupq_n_bf16(simde_bfloat16_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vdupq_n_bf16(value); + #else + simde_bfloat16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vdupq_n_bf16 + #define vdupq_n_bf16(value) simde_vdupq_n_bf16((value)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/eor.h b/arm/neon/eor.h index bf5a66d3b..50791d1c9 100644 --- a/arm/neon/eor.h +++ b/arm/neon/eor.h @@ -23,6 +23,9 @@ * Copyright: * 2020 Evan 
Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su (Copyright owned by NTHU pllab) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_EOR_H) @@ -47,6 +50,8 @@ simde_veor_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -77,6 +82,8 @@ simde_veor_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -107,6 +114,8 @@ simde_veor_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -137,6 +146,8 @@ simde_veor_s64(simde_int64x1_t a, simde_int64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -167,6 +178,8 @@ simde_veor_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -197,6 +210,8 @@ simde_veor_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -227,6 +242,8 @@ simde_veor_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -257,6 +274,8 @@ simde_veor_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -291,6 +310,8 @@ simde_veorq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -325,6 +346,8 @@ simde_veorq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i16m1(a_.sv128, b_.sv128, 8); #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -359,6 +382,8 @@ simde_veorq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -393,6 +418,8 @@ simde_veorq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -427,6 +454,8 @@ simde_veorq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -461,6 +490,8 @@ simde_veorq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -495,6 +526,8 @@ simde_veorq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -529,6 +562,8 @@ simde_veorq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -546,6 +581,263 @@ simde_veorq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define veorq_u64(a, b) simde_veorq_u64((a), (b)) #endif +// Note: EOR3 instructions are implemented only when FEAT_SHA3 is implemented. 
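+// A minimal usage sketch (illustrative only, not part of this patch): on AArch64
+// with FEAT_SHA3 the call below lowers to a single EOR3 instruction, while the
+// portable fallbacks that follow simply chain two XORs (or two RVV vxor calls
+// on RISC-V). Variable names a, b, c are assumed to be in scope.
+//   simde_uint64x2_t r = simde_veor3q_u64(a, b, c);  /* lane-wise r = a ^ b ^ c */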
+SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_veor3q_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s8(a, b, c); + #else + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i8m1(__riscv_vxor_vv_i8m1(a_.sv128, b_.sv128, 16), c_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_s8 + #define veor3q_s8(a, b, c) simde_veor3q_s8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_veor3q_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s16(a, b, c); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i16m1(__riscv_vxor_vv_i16m1(a_.sv128, b_.sv128, 8), c_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_s16 + #define veor3q_s16(a, b, c) simde_veor3q_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_veor3q_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s32(a, b, c); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i32m1(__riscv_vxor_vv_i32m1(a_.sv128, b_.sv128, 4), c_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_s32 + #define veor3q_s32(a, b, c) simde_veor3q_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_veor3q_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s64(a, b, c); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b), + c_ = simde_int64x2_to_private(c); + + 
#if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i64m1(__riscv_vxor_vv_i64m1(a_.sv128, b_.sv128, 2), c_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_s64 + #define veor3q_s64(a, b, c) simde_veor3q_s64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_veor3q_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u8(a, b, c); + #else + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u8m1(__riscv_vxor_vv_u8m1(a_.sv128, b_.sv128, 16), c_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_u8 + #define veor3q_u8(a, b, c) simde_veor3q_u8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_veor3q_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u16(a, b, c); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u16m1(__riscv_vxor_vv_u16m1(a_.sv128, b_.sv128, 8), c_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_u16 + #define veor3q_u16(a, b, c) simde_veor3q_u16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_veor3q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u32(a, b, c); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u32m1(__riscv_vxor_vv_u32m1(a_.sv128, b_.sv128, 4), c_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_u32 + #define veor3q_u32(a, b, c) simde_veor3q_u32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_veor3q_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u64(a, b, c); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b), + c_ = simde_uint64x2_to_private(c); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u64m1(__riscv_vxor_vv_u64m1(a_.sv128, b_.sv128, 2), c_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) + #undef veor3q_u64 + #define veor3q_u64(a, b, c) simde_veor3q_u64((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/ext.h b/arm/neon/ext.h index 0768e9d1a..9e4740470 100644 --- a/arm/neon/ext.h +++ b/arm/neon/ext.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_EXT_H) @@ -33,6 +35,38 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + simde_float16x4_t r; + SIMDE_CONSTIFY_4_(vext_f16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b), + r_ = a_; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_.sv64 = __riscv_vslidedown_vx_f16m1(a_.sv64, n, 4); + r_.sv64 = __riscv_vslideup_vx_f16m1(a_.sv64, b_.sv64, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; + } + #endif + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vext_f16 + #define vext_f16(a, b, n) simde_vext_f16((a), (b), (n)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n) @@ -46,15 +80,20 @@ simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n) a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_f32m1(a_.sv64, n, 2); + r_.sv64 = __riscv_vslideup_vx_f32m1(a_.sv64, b_.sv64, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_f32(a, b, n) simde_float32x2_from_m64(_mm_alignr_pi8(simde_float32x2_to_m64(b), simde_float32x2_to_m64(a), n * sizeof(simde_float32))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_f32(a, b, n) (__extension__ ({ \ @@ -81,15 +120,20 @@ simde_vext_f64(simde_float64x1_t a, simde_float64x1_t b, const int n) a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_f64m1(a_.sv64, n, 1); + r_.sv64 = __riscv_vslideup_vx_f64m1(a_.sv64, b_.sv64, 1-n, 1); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 0]; + } + #endif return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_f64(a, b, n) simde_float64x1_from_m64(_mm_alignr_pi8(simde_float64x1_to_m64(b), simde_float64x1_to_m64(a), n * sizeof(simde_float64))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vext_f64(a, b, n) (__extension__ ({ \ @@ -117,15 +161,20 @@ simde_vext_s8(simde_int8x8_t a, simde_int8x8_t b, const int n) a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i8m1(a_.sv64, n, 8); + r_.sv64 = __riscv_vslideup_vx_i8m1(a_.sv64, b_.sv64, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif return simde_int8x8_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s8(a, b, n) simde_int8x8_from_m64(_mm_alignr_pi8(simde_int8x8_to_m64(b), simde_int8x8_to_m64(a), n * sizeof(int8_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_s8(a, b, n) (__extension__ ({ \ @@ -156,15 +205,20 @@ simde_vext_s16(simde_int16x4_t a, simde_int16x4_t b, const int n) a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i16m1(a_.sv64, n, 4); + r_.sv64 = __riscv_vslideup_vx_i16m1(a_.sv64, b_.sv64, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; + } + #endif return simde_int16x4_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s16(a, b, n) simde_int16x4_from_m64(_mm_alignr_pi8(simde_int16x4_to_m64(b), simde_int16x4_to_m64(a), n * sizeof(int16_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_s16(a, b, n) (__extension__ ({ \ @@ -193,15 +247,20 @@ simde_vext_s32(simde_int32x2_t a, simde_int32x2_t b, const int n) a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i32m1(a_.sv64, n, 2); + r_.sv64 = __riscv_vslideup_vx_i32m1(a_.sv64, b_.sv64, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_int32x2_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s32(a, b, n) simde_int32x2_from_m64(_mm_alignr_pi8(simde_int32x2_to_m64(b), simde_int32x2_to_m64(a), n * sizeof(int32_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_s32(a, b, n) (__extension__ ({ \ @@ -228,15 +287,20 @@ simde_vext_s64(simde_int64x1_t a, simde_int64x1_t b, const int n) a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i64m1(a_.sv64, n, 1); + r_.sv64 = __riscv_vslideup_vx_i64m1(a_.sv64, b_.sv64, 1-n, 1); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 0]; + } + #endif return simde_int64x1_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s64(a, b, n) simde_int64x1_from_m64(_mm_alignr_pi8(simde_int64x1_to_m64(b), simde_int64x1_to_m64(a), n * sizeof(int64_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vext_s64(a, b, n) (__extension__ ({ \ @@ -264,15 +328,20 @@ simde_vext_u8(simde_uint8x8_t a, simde_uint8x8_t b, const int n) a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u8m1(a_.sv64, n, 8); + r_.sv64 = __riscv_vslideup_vx_u8m1(a_.sv64, b_.sv64, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif return simde_uint8x8_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u8(a, b, n) simde_uint8x8_from_m64(_mm_alignr_pi8(simde_uint8x8_to_m64(b), simde_uint8x8_to_m64(a), n * sizeof(uint8_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_u8(a, b, n) (__extension__ ({ \ @@ -303,15 +372,20 @@ simde_vext_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int n) a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u16m1(a_.sv64, n, 4); + r_.sv64 = __riscv_vslideup_vx_u16m1(a_.sv64, b_.sv64, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; + } + #endif return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u16(a, b, n) simde_uint16x4_from_m64(_mm_alignr_pi8(simde_uint16x4_to_m64(b), simde_uint16x4_to_m64(a), n * sizeof(uint16_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_u16(a, b, n) (__extension__ ({ \ @@ -340,15 +414,20 @@ simde_vext_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int n) a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u32m1(a_.sv64, n, 2); + r_.sv64 = __riscv_vslideup_vx_u32m1(a_.sv64, b_.sv64, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_uint32x2_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u32(a, b, n) simde_uint32x2_from_m64(_mm_alignr_pi8(simde_uint32x2_to_m64(b), simde_uint32x2_to_m64(a), n * sizeof(uint32_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_u32(a, b, n) (__extension__ ({ \ @@ -375,15 +454,20 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n) a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u64m1(a_.sv64, n, 1); + r_.sv64 = __riscv_vslideup_vx_u64m1(a_.sv64, b_.sv64, 1-n, 1); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 0]; + } + #endif return simde_uint64x1_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u64(a, b, n) simde_uint64x1_from_m64(_mm_alignr_pi8(simde_uint64x1_to_m64(b), simde_uint64x1_to_m64(a), n * sizeof(uint64_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vext_u64(a, b, n) (__extension__ ({ \ @@ -398,6 +482,38 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n) #define vext_u64(a, b, n) simde_vext_u64((a), (b), (n)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + simde_float16x8_t r; + SIMDE_CONSTIFY_8_(vextq_f16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b), + r_ = a_; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_.sv128 = __riscv_vslidedown_vx_f16m1(a_.sv128, n, 8); + r_.sv128 = __riscv_vslideup_vx_f16m1(a_.sv128, b_.sv128, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vextq_f16 + #define vextq_f16(a, b, n) simde_vextq_f16((a), (b), (n)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) @@ -411,16 +527,29 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_f32m1(a_.sv128, n, 4); + r_.sv128 = __riscv_vslideup_vx_f32m1(a_.sv128, b_.sv128, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; + } + #endif return simde_float32x4_from_private(r_); #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) - #define simde_vextq_f32(a, b, n) simde_float32x4_from_m128(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(simde_float32x4_to_m128(b)), _mm_castps_si128(simde_float32x4_to_m128(a)), n * sizeof(simde_float32)))) + #define simde_vextq_f32(a, b, n) simde_float32x4_from_m128(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(simde_float32x4_to_m128(b)), _mm_castps_si128(simde_float32x4_to_m128(a)), (n) * sizeof(simde_float32)))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_f32(a, b, n) (__extension__ ({ \ + simde_float32x4_private simde_vextq_f32_r_; \ + simde_vextq_f32_r_.v128 = wasm_i32x4_shuffle(simde_float32x4_to_private(a).v128, simde_float32x4_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ + simde_float32x4_from_private(simde_vextq_f32_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_f32(a, b, n) (__extension__ ({ \ simde_float32x4_private simde_vextq_f32_r_; \ @@ -448,16 +577,28 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_f64m1(a_.sv128, n, 2); + r_.sv128 = __riscv_vslideup_vx_f64m1(a_.sv128, b_.sv128, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 1]; + } + #endif return simde_float64x2_from_private(r_); #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) - #define simde_vextq_f64(a, b, n) simde_float64x2_from_m128d(_mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(simde_float64x2_to_m128d(b)), _mm_castpd_si128(simde_float64x2_to_m128d(a)), n * sizeof(simde_float64)))) + #define simde_vextq_f64(a, b, n) simde_float64x2_from_m128d(_mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(simde_float64x2_to_m128d(b)), _mm_castpd_si128(simde_float64x2_to_m128d(a)), (n) * sizeof(simde_float64)))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_f64(a, b, n) (__extension__ ({ \ + simde_float64x2_private simde_vextq_f64_r_; \ + simde_vextq_f64_r_.v128 = wasm_i64x2_shuffle(simde_float64x2_to_private(a).v128, simde_float64x2_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \ + simde_float64x2_from_private(simde_vextq_f64_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_f64(a, b, n) (__extension__ ({ \ simde_float64x2_private simde_vextq_f64_r_; \ @@ -484,16 +625,35 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) a_ = simde_int8x16_to_private(a), b_ = simde_int8x16_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128, n, 16); + r_.sv128 = __riscv_vslideup_vx_i8m1(a_.sv128, b_.sv128, 16-n, 16); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 15]; + } + #endif return simde_int8x16_from_private(r_); #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s8(a, b, n) simde_int8x16_from_m128i(_mm_alignr_epi8(simde_int8x16_to_m128i(b), simde_int8x16_to_m128i(a), n * sizeof(int8_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s8(a, b, n) (__extension__ ({ \ + simde_int8x16_private simde_vextq_s8_r_; \ + simde_vextq_s8_r_.v128 = wasm_i8x16_shuffle(simde_int8x16_to_private(a).v128, simde_int8x16_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 4)), HEDLEY_STATIC_CAST(int8_t, ((n) + 5)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 8)), HEDLEY_STATIC_CAST(int8_t, ((n) + 9)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 10)), HEDLEY_STATIC_CAST(int8_t, ((n) + 11)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 12)), HEDLEY_STATIC_CAST(int8_t, ((n) + 13)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 14)), HEDLEY_STATIC_CAST(int8_t, ((n) + 15))); \ + simde_int8x16_from_private(simde_vextq_s8_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s8(a, b, n) (__extension__ ({ \ simde_int8x16_private simde_vextq_s8_r_; \ @@ -527,16 +687,31 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n) a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128, n, 8); + r_.sv128 = __riscv_vslideup_vx_i16m1(a_.sv128, b_.sv128, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 7]; + } + #endif return simde_int16x8_from_private(r_); #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s16(a, b, n) simde_int16x8_from_m128i(_mm_alignr_epi8(simde_int16x8_to_m128i(b), simde_int16x8_to_m128i(a), n * sizeof(int16_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s16(a, b, n) (__extension__ ({ \ + simde_int16x8_private simde_vextq_s16_r_; \ + simde_vextq_s16_r_.v128 = wasm_i16x8_shuffle(simde_int16x8_to_private(a).v128, simde_int16x8_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 4)), HEDLEY_STATIC_CAST(int8_t, ((n) + 5)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \ + simde_int16x8_from_private(simde_vextq_s16_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s16(a, b, n) (__extension__ ({ \ simde_int16x8_private simde_vextq_s16_r_; \ @@ -566,16 +741,29 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128, n, 4); + r_.sv128 = __riscv_vslideup_vx_i32m1(a_.sv128, b_.sv128, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_int32x4_from_private(r_); #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s32(a, b, n) simde_int32x4_from_m128i(_mm_alignr_epi8(simde_int32x4_to_m128i(b), simde_int32x4_to_m128i(a), n * sizeof(int32_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s32(a, b, n) (__extension__ ({ \ + simde_int32x4_private simde_vextq_s32_r_; \ + simde_vextq_s32_r_.v128 = wasm_i32x4_shuffle(simde_int32x4_to_private(a).v128, simde_int32x4_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ + simde_int32x4_from_private(simde_vextq_s32_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s32(a, b, n) (__extension__ ({ \ simde_int32x4_private simde_vextq_s32_r_; \ @@ -603,16 +791,28 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i64m1(a_.sv128, n, 2); + r_.sv128 = __riscv_vslideup_vx_i64m1(a_.sv128, b_.sv128, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_int64x2_from_private(r_); #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s64(a, b, n) simde_int64x2_from_m128i(_mm_alignr_epi8(simde_int64x2_to_m128i(b), simde_int64x2_to_m128i(a), n * sizeof(int64_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s64(a, b, n) (__extension__ ({ \ + simde_int64x2_private simde_vextq_s64_r_; \ + simde_vextq_s64_r_.v128 = wasm_i64x2_shuffle(simde_int64x2_to_private(a).v128, simde_int64x2_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \ + simde_int64x2_from_private(simde_vextq_s64_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s64(a, b, n) (__extension__ ({ \ simde_int64x2_private simde_vextq_s64_r_; \ @@ -639,11 +839,16 @@ simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n) a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128, n, 16); + r_.sv128 = __riscv_vslideup_vx_u8m1(a_.sv128, b_.sv128, 16-n, 16); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + } + #endif return simde_uint8x16_from_private(r_); #endif } @@ -682,11 +887,16 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n) a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128, n, 8); + r_.sv128 = __riscv_vslideup_vx_u16m1(a_.sv128, b_.sv128, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 7]; + } + #endif return simde_uint16x8_from_private(r_); #endif } @@ -730,11 +940,16 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n) a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128, n, 4); + r_.sv128 = __riscv_vslideup_vx_u32m1(a_.sv128, b_.sv128, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_uint32x4_from_private(r_); #endif } @@ -767,11 +982,16 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u64m1(a_.sv128, n, 2); + r_.sv128 = __riscv_vslideup_vx_u64m1(a_.sv128, b_.sv128, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_uint64x2_from_private(r_); #endif } @@ -790,6 +1010,161 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) #define vextq_u64(a, b, n) simde_vextq_u64((a), (b), (n)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vext_p8(simde_poly8x8_t a, simde_poly8x8_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8_t r; + SIMDE_CONSTIFY_8_(vext_p8, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly8x8_private + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 7]; + } + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vext_p8 + #define vext_p8(a, b, n) simde_vext_p8((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vext_p16(simde_poly16x4_t a, simde_poly16x4_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4_t r; + SIMDE_CONSTIFY_4_(vext_p16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly16x4_private + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vext_p16 + #define vext_p16(a, b, n) simde_vext_p16((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vext_p64(simde_poly64x1_t a, simde_poly64x1_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 0) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + (void) n; + return vext_p64(a, b, 0); + #else + simde_poly64x1_private + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src >= (sizeof(r_.values) / sizeof(r_.values[0]))) ? b_.values[src & 0] : a_.values[src]; + } + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vext_p64 + #define vext_p64(a, b, n) simde_vext_p64((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vextq_p8(simde_poly8x16_t a, simde_poly8x16_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x16_t r; + SIMDE_CONSTIFY_16_(vextq_p8, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly8x16_private + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + } + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vextq_p8 + #define vextq_p8(a, b, n) simde_vextq_p8((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vextq_p16(simde_poly16x8_t a, simde_poly16x8_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x8_t r; + SIMDE_CONSTIFY_8_(vextq_p16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly16x8_private + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 7]; + } + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vextq_p16 + #define vextq_p16(a, b, n) simde_vextq_p16((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vextq_p64(simde_poly64x2_t a, simde_poly64x2_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 1) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + simde_poly64x2_t r; + SIMDE_CONSTIFY_2_(vextq_p64, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vextq_p64 + #define vextq_p64(a, b, n) simde_vextq_p64((a), (b), (n)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/fma.h b/arm/neon/fma.h index 4ee30d1d6..060e165e3 100644 --- a/arm/neon/fma.h +++ b/arm/neon/fma.h @@ -22,6 +22,8 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +* 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_FMA_H) @@ -34,16 +36,41 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vfmah_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmah_f16(a, b, c); + #else + return simde_vaddh_f16(a, simde_vmulh_f16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmah_f16 + #define vfmah_f16(a, b, c) simde_vfmah_f16(a, b, c) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vadd_f32(a, simde_vmul_f32(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_f32 #define vfma_f32(a, b, c) simde_vfma_f32(a, b, c) #endif @@ -51,26 +78,84 @@ simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f64(a, b, c); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vadd_f64(a, simde_vmul_f64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_f64 #define vfma_f64(a, b, c) simde_vfma_f64(a, b, c) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfma_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfma_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b), + c_ = simde_float16x4_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_float16x4_from_private(r_); + #else + return simde_vadd_f16(a, simde_vmul_f16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfma_f16 + #define vfma_f16(a, b, c) simde_vfma_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmaq_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b), + c_ = simde_float16x8_to_private(c); + + r_.sv128 = __riscv_vfmacc_vv_f16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_float16x8_from_private(r_); + #else + return simde_vaddq_f16(a, simde_vmulq_f16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmaq_f16 + #define vfmaq_f16(a, b, c) simde_vfmaq_f16(a, b, c) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f32(a, b, c); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), @@ -79,6 +164,8 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128 = _mm_fmadd_ps(b_.m128, c_.m128, a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); #endif return simde_float32x4_from_private(r_); @@ -86,7 +173,8 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { return simde_vaddq_f32(a, simde_vmulq_f32(b, c)); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_f32 #define vfmaq_f32(a, b, c) simde_vfmaq_f32(a, b, c) #endif @@ -94,12 +182,12 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f64(a, b, c); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), @@ -108,6 +196,8 @@ simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128d = _mm_fmadd_pd(b_.m128d, c_.m128d, a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); #endif return simde_float64x2_from_private(r_); @@ -115,7 +205,8 @@ simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { return simde_vaddq_f64(a, simde_vmulq_f64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_f64 #define vfmaq_f64(a, b, c) simde_vfmaq_f64(a, b, c) #endif diff --git a/arm/neon/fma_lane.h b/arm/neon/fma_lane.h index 6100ed78c..54ae4d6ef 100644 --- a/arm/neon/fma_lane.h +++ b/arm/neon/fma_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_FMA_LANE_H) @@ -38,7 +39,7 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ /* simde_vfmad_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmad_lane_f64(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_lane_f64(a, b, v, lane)) @@ -55,13 +56,14 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmad_lane_f64 #define vfmad_lane_f64(a, b, v, lane) simde_vfmad_lane_f64(a, b, v, lane) #endif /* simde_vfmad_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmad_laneq_f64(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_laneq_f64(a, b, v, lane)) @@ -78,13 +80,62 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) 
#undef vfmad_laneq_f64 #define vfmad_laneq_f64(a, b, v, lane) simde_vfmad_laneq_f64(a, b, v, lane) #endif +/* simde_vfmah_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmah_lane_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmah_lane_f16(a, b, v, lane)) + #else + #define simde_vfmah_lane_f16(a, b, v, lane) vfmah_lane_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmah_lane_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vadd_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_lane_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmah_lane_f16 + #define vfmah_lane_f16(a, b, v, lane) simde_vfmah_lane_f16(a, b, v, lane) +#endif + +/* simde_vfmah_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmah_laneq_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmah_laneq_f16(a, b, v, lane)) + #else + #define simde_vfmah_laneq_f16(a, b, v, lane) vfmah_laneq_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmah_laneq_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vadd_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_laneq_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmah_laneq_f16 + #define vfmah_laneq_f16(a, b, v, lane) simde_vfmah_laneq_f16(a, b, v, lane) +#endif + /* simde_vfmas_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmas_lane_f32(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_lane_f32(a, b, v, lane)) @@ -101,13 +152,14 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmas_lane_f32 #define vfmas_lane_f32(a, b, v, lane) simde_vfmas_lane_f32(a, b, v, lane) #endif /* simde_vfmas_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmas_laneq_f32(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_laneq_f32(a, b, v, lane)) @@ -124,97 +176,155 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmas_laneq_f32 #define vfmas_laneq_f32(a, b, v, lane) simde_vfmas_laneq_f32(a, b, v, lane) #endif +/* 
simde_vfma_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfma_lane_f16(a, b, v, lane) vfma_lane_f16(a, b, v, lane) +#else + #define simde_vfma_lane_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfma_lane_f16 + #define vfma_lane_f16(a, b, v, lane) simde_vfma_lane_f16(a, b, v, lane) +#endif + /* simde_vfma_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_lane_f32(a, b, v, lane) vfma_lane_f32(a, b, v, lane) #else #define simde_vfma_lane_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_lane_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_lane_f32 #define vfma_lane_f32(a, b, v, lane) simde_vfma_lane_f32(a, b, v, lane) #endif /* simde_vfma_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_lane_f64(a, b, v, lane) vfma_lane_f64((a), (b), (v), (lane)) #else #define simde_vfma_lane_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_lane_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_lane_f64 #define vfma_lane_f64(a, b, v, lane) simde_vfma_lane_f64(a, b, v, lane) #endif +/* simde_vfma_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfma_laneq_f16(a, b, v, lane) vfma_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfma_laneq_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfma_laneq_f16 + #define vfma_laneq_f16(a, b, v, lane) simde_vfma_laneq_f16(a, b, v, lane) +#endif + /* simde_vfma_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_laneq_f32(a, b, v, lane) vfma_laneq_f32((a), (b), (v), (lane)) #else #define simde_vfma_laneq_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_laneq_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_laneq_f32 #define vfma_laneq_f32(a, b, v, lane) simde_vfma_laneq_f32(a, b, v, lane) #endif /* simde_vfma_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_laneq_f64(a, b, v, lane) vfma_laneq_f64((a), (b), (v), (lane)) #else #define simde_vfma_laneq_f64(a, b, v, lane) 
simde_vadd_f64(a, simde_vmul_laneq_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_laneq_f64 #define vfma_laneq_f64(a, b, v, lane) simde_vfma_laneq_f64(a, b, v, lane) #endif /* simde_vfmaq_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_lane_f64(a, b, v, lane) vfmaq_lane_f64((a), (b), (v), (lane)) #else #define simde_vfmaq_lane_f64(a, b, v, lane) simde_vaddq_f64(a, simde_vmulq_lane_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_lane_f64 #define vfmaq_lane_f64(a, b, v, lane) simde_vfmaq_lane_f64(a, b, v, lane) #endif +/* simde_vfmaq_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmaq_lane_f16(a, b, v, lane) vfmaq_lane_f16((a), (b), (v), (lane)) +#else + #define simde_vfmaq_lane_f16(a, b, v, lane) simde_vaddq_f16(a, simde_vmulq_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmaq_lane_f16 + #define vfmaq_lane_f16(a, b, v, lane) simde_vfmaq_lane_f16(a, b, v, lane) +#endif + /* simde_vfmaq_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_lane_f32(a, b, v, lane) vfmaq_lane_f32((a), (b), (v), (lane)) #else #define simde_vfmaq_lane_f32(a, b, v, lane) simde_vaddq_f32(a, simde_vmulq_lane_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_lane_f32 #define vfmaq_lane_f32(a, b, v, lane) simde_vfmaq_lane_f32(a, b, v, lane) #endif +/* simde_vfmaq_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmaq_laneq_f16(a, b, v, lane) vfmaq_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfmaq_laneq_f16(a, b, v, lane) \ + simde_vaddq_f16(a, simde_vmulq_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmaq_laneq_f16 + #define vfmaq_laneq_f16(a, b, v, lane) simde_vfmaq_laneq_f16(a, b, v, lane) +#endif + /* simde_vfmaq_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_laneq_f32(a, b, v, lane) vfmaq_laneq_f32((a), (b), (v), (lane)) #else #define simde_vfmaq_laneq_f32(a, b, v, lane) \ simde_vaddq_f32(a, simde_vmulq_laneq_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef 
vfmaq_laneq_f32 #define vfmaq_laneq_f32(a, b, v, lane) simde_vfmaq_laneq_f32(a, b, v, lane) #endif /* simde_vfmaq_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_laneq_f64(a, b, v, lane) vfmaq_laneq_f64((a), (b), (v), (lane)) #else #define simde_vfmaq_laneq_f64(a, b, v, lane) \ simde_vaddq_f64(a, simde_vmulq_laneq_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_laneq_f64 #define vfmaq_laneq_f64(a, b, v, lane) simde_vfmaq_laneq_f64(a, b, v, lane) #endif diff --git a/arm/neon/fma_n.h b/arm/neon/fma_n.h index 6cf58259c..e9afae87c 100644 --- a/arm/neon/fma_n.h +++ b/arm/neon/fma_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_FMA_N_H) @@ -35,16 +36,47 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfma_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfma_n_f16(a, b, c); + #else + return simde_vfma_f16(a, b, simde_vdup_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfma_n_f16 + #define vfma_n_f16(a, b, c) simde_vfma_n_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmaq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfmaq_n_f16(a, b, c); + #else + return simde_vfmaq_f16(a, b, simde_vdupq_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmaq_n_f16 + #define vfmaq_n_f16(a, b, c) simde_vfmaq_n_f16(a, b, c) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfma_n_f32(a, b, c); #else return simde_vfma_f32(a, b, simde_vdup_n_f32(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && 
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) #undef vfma_n_f32 #define vfma_n_f32(a, b, c) simde_vfma_n_f32(a, b, c) #endif @@ -52,13 +84,14 @@ simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfma_n_f64(a, b, c); #else return simde_vfma_f64(a, b, simde_vdup_n_f64(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vfma_n_f64 #define vfma_n_f64(a, b, c) simde_vfma_n_f64(a, b, c) #endif @@ -66,13 +99,14 @@ simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfmaq_n_f32(a, b, c); #else return simde_vfmaq_f32(a, b, simde_vdupq_n_f32(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) #undef vfmaq_n_f32 #define vfmaq_n_f32(a, b, c) simde_vfmaq_n_f32(a, b, c) #endif @@ -80,13 +114,14 @@ simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vfmaq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfmaq_n_f64(a, b, c); #else return simde_vfmaq_f64(a, b, simde_vdupq_n_f64(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vfmaq_n_f64 #define vfmaq_n_f64(a, b, c) simde_vfmaq_n_f64(a, b, c) #endif diff --git a/arm/neon/fmlal.h b/arm/neon/fmlal.h new file mode 100644 index 000000000..8fa297e5d --- /dev/null +++ b/arm/neon/fmlal.h @@ -0,0 +1,557 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the 
Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_FMLAL_H) +#define SIMDE_ARM_NEON_FMLAL_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlal_low_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlal_low_f16 + #define vfmlal_low_f16(r, a, b) simde_vfmlal_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlalq_low_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlalq_low_f16 + #define vfmlalq_low_f16(r, a, b) simde_vfmlalq_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlal_high_f16(r, a, b); + #else + 
simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlal_high_f16 + #define vfmlal_high_f16(r, a, b) simde_vfmlal_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlalq_high_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlalq_high_f16 + #define vfmlalq_high_f16(r, a, b) simde_vfmlalq_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_lane_low_f16(r, a, b, lane) vfmlal_lane_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlal_lane_low_f16 + #define vfmlal_lane_low_f16(r, a, b, lane) simde_vfmlal_lane_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for 
(size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_laneq_low_f16(r, a, b, lane) vfmlal_laneq_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlal_laneq_low_f16 + #define vfmlal_laneq_low_f16(r, a, b, lane) simde_vfmlal_laneq_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_lane_low_f16(r, a, b, lane) vfmlalq_lane_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlalq_lane_low_f16 + #define vfmlalq_lane_low_f16(r, a, b, lane) simde_vfmlalq_lane_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_laneq_low_f16(r, a, b, lane) vfmlalq_laneq_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlalq_laneq_low_f16 + #define vfmlalq_laneq_low_f16(r, a, b, lane) simde_vfmlalq_laneq_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t 
high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_lane_high_f16(r, a, b, lane) vfmlal_lane_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlal_lane_high_f16 + #define vfmlal_lane_high_f16(r, a, b, lane) simde_vfmlal_lane_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_laneq_high_f16(r, a, b, lane) vfmlal_laneq_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlal_laneq_high_f16 + #define vfmlal_laneq_high_f16(r, a, b, lane) simde_vfmlal_laneq_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_lane_high_f16(r, a, b, lane) vfmlalq_lane_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlalq_lane_high_f16 + #define vfmlalq_lane_high_f16(r, a, b, lane) simde_vfmlalq_lane_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_laneq_high_f16(simde_float32x4_t 
r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_laneq_high_f16(r, a, b, lane) vfmlalq_laneq_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlalq_laneq_high_f16 + #define vfmlalq_laneq_high_f16(r, a, b, lane) simde_vfmlalq_laneq_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlalbq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vbfmlalbq_f32(r, a, b); + #else + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[i * 2]); + } + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmlalbq_f32 + #define vbfmlalbq_f32(r, a, b) simde_vbfmlalbq_f32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlaltq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vbfmlaltq_f32(r, a, b); + #else + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[i * 2 + 1]); + } + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmlaltq_f32 + #define vbfmlaltq_f32(r, a, b) simde_vbfmlaltq_f32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlalbq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / 
sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlalbq_lane_f32(r, a, b, lane) vbfmlalbq_lane_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmlalbq_lane_f32 + #define vbfmlalbq_lane_f32(r, a, b, lane) simde_vbfmlalbq_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlalbq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlalbq_laneq_f32(r, a, b, lane) vbfmlalbq_laneq_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmlalbq_laneq_f32 + #define vbfmlalbq_laneq_f32(r, a, b, lane) simde_vbfmlalbq_laneq_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlaltq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlaltq_lane_f32(r, a, b, lane) vbfmlaltq_lane_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmlaltq_lane_f32 + #define vbfmlaltq_lane_f32(r, a, b, lane) simde_vbfmlaltq_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlaltq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlaltq_laneq_f32(r, a, b, lane) vbfmlaltq_laneq_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmlaltq_laneq_f32 + #define vbfmlaltq_laneq_f32(r, a, b, lane) simde_vbfmlaltq_laneq_f32((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMLAL_H) */ diff --git a/arm/neon/fmlsl.h b/arm/neon/fmlsl.h new file mode 100644 index 000000000..1517fafd9 --- /dev/null +++ b/arm/neon/fmlsl.h @@ -0,0 +1,397 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_FMLSL_H) +#define SIMDE_ARM_NEON_FMLSL_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlsl_low_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlsl_low_f16 + #define vfmlsl_low_f16(r, a, b) simde_vfmlsl_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlslq_low_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlslq_low_f16 + #define vfmlslq_low_f16(r, a, b) simde_vfmlslq_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlsl_high_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlsl_high_f16 + #define vfmlsl_high_f16(r, a, b) simde_vfmlsl_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_high_f16(simde_float32x4_t r, simde_float16x8_t a, 
simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlslq_high_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlslq_high_f16 + #define vfmlslq_high_f16(r, a, b) simde_vfmlslq_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_lane_low_f16(r, a, b, lane) vfmlsl_lane_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlsl_lane_low_f16 + #define vfmlsl_lane_low_f16(r, a, b, lane) simde_vfmlsl_lane_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_laneq_low_f16(r, a, b, lane) vfmlsl_laneq_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlsl_laneq_low_f16 + #define vfmlsl_laneq_low_f16(r, a, b, lane) simde_vfmlsl_laneq_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_lane_low_f16(r, a, b, lane) vfmlslq_lane_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlslq_lane_low_f16 + #define vfmlslq_lane_low_f16(r, a, b, lane) simde_vfmlslq_lane_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_laneq_low_f16(r, a, b, lane) vfmlslq_laneq_low_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlslq_laneq_low_f16 + #define vfmlslq_laneq_low_f16(r, a, b, lane) simde_vfmlslq_laneq_low_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_lane_high_f16(r, a, b, lane) vfmlsl_lane_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlsl_lane_high_f16 + #define vfmlsl_lane_high_f16(r, a, b, lane) simde_vfmlsl_lane_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t 
+simde_vfmlsl_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_laneq_high_f16(r, a, b, lane) vfmlsl_laneq_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlsl_laneq_high_f16 + #define vfmlsl_laneq_high_f16(r, a, b, lane) simde_vfmlsl_laneq_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_lane_high_f16(r, a, b, lane) vfmlslq_lane_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlslq_lane_high_f16 + #define vfmlslq_lane_high_f16(r, a, b, lane) simde_vfmlslq_lane_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_laneq_high_f16(r, a, b, lane) vfmlslq_laneq_high_f16((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) + #undef vfmlslq_laneq_high_f16 + #define vfmlslq_laneq_high_f16(r, a, b, lane) simde_vfmlslq_laneq_high_f16((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMLSL_H) */ diff --git a/arm/neon/fms.h b/arm/neon/fms.h new file mode 100644 index 000000000..b1eb99c7d --- /dev/null +++ b/arm/neon/fms.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +* Copyright: +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +* 2023 Ju-Hung Li (Copyright owned by NTHU pllab) +*/ + +#if !defined(SIMDE_ARM_NEON_FMS_H) +#define SIMDE_ARM_NEON_FMS_H + +#include "add.h" +#include "mul.h" +#include "neg.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vfmsh_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmsh_f16(a, b, c); + #else + return simde_vaddh_f16(a, simde_vnegh_f16(simde_vmulh_f16(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsh_f16 + #define vfmsh_f16(a, b, c) simde_vfmsh_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfms_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + return vfms_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); + #else + return simde_vadd_f32(a, simde_vneg_f32(simde_vmul_f32(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfms_f32 + #define vfms_f32(a, b, c) simde_vfms_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vfms_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_FMA) + return vfms_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); + #else + return simde_vadd_f64(a, simde_vneg_f64(simde_vmul_f64(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfms_f64 + #define vfms_f64(a, b, c) simde_vfms_f64(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfms_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfms_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b), + c_ = simde_float16x4_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_float16x4_from_private(r_); + #else + return simde_vadd_f16(a, simde_vneg_f16(simde_vmul_f16(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfms_f16 + #define vfms_f16(a, b, c) simde_vfms_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmsq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmsq_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b), + c_ = simde_float16x8_to_private(c); + r_.sv128 = __riscv_vfnmsac_vv_f16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_float16x8_from_private(r_); + #else + return simde_vaddq_f16(a, simde_vnegq_f16(simde_vmulq_f16(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsq_f16 + #define vfmsq_f16(a, b, c) simde_vfmsq_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + return vfmsq_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b), + c_ = simde_float32x4_to_private(c); + r_.sv128 = __riscv_vfnmsac_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_float32x4_from_private(r_); + #else + return simde_vaddq_f32(a, simde_vnegq_f32(simde_vmulq_f32(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsq_f32 + #define vfmsq_f32(a, b, c) simde_vfmsq_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vfmsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_FMA) + return vfmsq_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b), + c_ = simde_float64x2_to_private(c); + r_.sv128 = __riscv_vfnmsac_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); + return simde_float64x2_from_private(r_); + #else + return simde_vaddq_f64(a, simde_vnegq_f64(simde_vmulq_f64(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsq_f64 + #define vfmsq_f64(a, b, c) simde_vfmsq_f64(a, b, c) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMS_H) */ diff --git a/arm/neon/fms_lane.h b/arm/neon/fms_lane.h new file mode 100644 index 000000000..d0f9f86c2 --- /dev/null +++ b/arm/neon/fms_lane.h @@ -0,0 +1,334 @@ +/* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +* Copyright: +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +*/ + +#if !defined(SIMDE_ARM_NEON_FMS_LANE_H) +#define SIMDE_ARM_NEON_FMS_LANE_H + +#include "sub.h" +#include "dup_n.h" +#include "get_lane.h" +#include "mul.h" +#include "mul_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +/* simde_vfmsd_lane_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsd_lane_f64(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsd_lane_f64(a, b, v, lane)) + #else + #define simde_vfmsd_lane_f64(a, b, v, lane) vfmsd_lane_f64((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsd_lane_f64(a, b, v, lane) \ + simde_vget_lane_f64( \ + simde_vsub_f64( \ + simde_vdup_n_f64(a), \ + simde_vdup_n_f64(simde_vmuld_lane_f64(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsd_lane_f64 + #define vfmsd_lane_f64(a, b, v, lane) simde_vfmsd_lane_f64(a, b, v, lane) +#endif + +/* simde_vfmsd_laneq_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsd_laneq_f64(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsd_laneq_f64(a, b, v, lane)) + #else + #define simde_vfmsd_laneq_f64(a, b, v, lane) vfmsd_laneq_f64((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsd_laneq_f64(a, b, v, lane) \ + simde_vget_lane_f64( \ + simde_vsub_f64( \ + simde_vdup_n_f64(a), \ + simde_vdup_n_f64(simde_vmuld_laneq_f64(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsd_laneq_f64 + #define vfmsd_laneq_f64(a, b, v, lane) simde_vfmsd_laneq_f64(a, b, v, lane) +#endif + +/* simde_vfmsh_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsh_lane_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsh_lane_f16(a, b, v, lane)) + #else + #define simde_vfmsh_lane_f16(a, b, v, lane) vfmsh_lane_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsh_lane_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vsub_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_lane_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsh_lane_f16 + #define vfmsh_lane_f16(a, b, v, lane) simde_vfmsh_lane_f16(a, b, v, lane) +#endif + +/* simde_vfmsh_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsh_laneq_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsh_laneq_f16(a, b, v, lane)) + #else + #define simde_vfmsh_laneq_f16(a, b, v, lane) vfmsh_laneq_f16((a), (b), (v), (lane)) + #endif +#else + #define 
simde_vfmsh_laneq_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vsub_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_laneq_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsh_laneq_f16 + #define vfmsh_laneq_f16(a, b, v, lane) simde_vfmsh_laneq_f16(a, b, v, lane) +#endif + +/* simde_vfmss_lane_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmss_lane_f32(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmss_lane_f32(a, b, v, lane)) + #else + #define simde_vfmss_lane_f32(a, b, v, lane) vfmss_lane_f32((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmss_lane_f32(a, b, v, lane) \ + simde_vget_lane_f32( \ + simde_vsub_f32( \ + simde_vdup_n_f32(a), \ + simde_vdup_n_f32(simde_vmuls_lane_f32(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmss_lane_f32 + #define vfmss_lane_f32(a, b, v, lane) simde_vfmss_lane_f32(a, b, v, lane) +#endif + +/* simde_vfmss_laneq_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmss_laneq_f32(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmss_laneq_f32(a, b, v, lane)) + #else + #define simde_vfmss_laneq_f32(a, b, v, lane) vfmss_laneq_f32((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmss_laneq_f32(a, b, v, lane) \ + simde_vget_lane_f32( \ + simde_vsub_f32( \ + simde_vdup_n_f32(a), \ + simde_vdup_n_f32(simde_vmuls_laneq_f32(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmss_laneq_f32 + #define vfmss_laneq_f32(a, b, v, lane) simde_vfmss_laneq_f32(a, b, v, lane) +#endif + +/* simde_vfms_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfms_lane_f16(a, b, v, lane) vfms_lane_f16(a, b, v, lane) +#else + #define simde_vfms_lane_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfms_lane_f16 + #define vfms_lane_f16(a, b, v, lane) simde_vfms_lane_f16(a, b, v, lane) +#endif + +/* simde_vfms_lane_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_lane_f32(a, b, v, lane) vfms_lane_f32(a, b, v, lane) +#else + #define simde_vfms_lane_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_lane_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfms_lane_f32 + #define vfms_lane_f32(a, b, v, lane) simde_vfms_lane_f32(a, b, v, lane) +#endif + +/* simde_vfms_lane_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_lane_f64(a, b, v, lane) vfms_lane_f64((a), (b), (v), (lane)) +#else 
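+  /* Portable fallback: without the AArch64 FMA intrinsic this expands to a
+   * separate multiply and subtract, i.e. a - (b * v[lane]) per element, so
+   * the multiply-subtract is not fused into a single rounding step. */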
+ #define simde_vfms_lane_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_lane_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfms_lane_f64 + #define vfms_lane_f64(a, b, v, lane) simde_vfms_lane_f64(a, b, v, lane) +#endif + +/* simde_vfms_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfms_laneq_f16(a, b, v, lane) vfms_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfms_laneq_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfms_laneq_f16 + #define vfms_laneq_f16(a, b, v, lane) simde_vfms_laneq_f16(a, b, v, lane) +#endif + +/* simde_vfms_laneq_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_laneq_f32(a, b, v, lane) vfms_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vfms_laneq_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_laneq_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfms_laneq_f32 + #define vfms_laneq_f32(a, b, v, lane) simde_vfms_laneq_f32(a, b, v, lane) +#endif + +/* simde_vfms_laneq_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_laneq_f64(a, b, v, lane) vfms_laneq_f64((a), (b), (v), (lane)) +#else + #define simde_vfms_laneq_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_laneq_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfms_laneq_f64 + #define vfms_laneq_f64(a, b, v, lane) simde_vfms_laneq_f64(a, b, v, lane) +#endif + +/* simde_vfmsq_lane_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_lane_f64(a, b, v, lane) vfmsq_lane_f64((a), (b), (v), (lane)) +#else + #define simde_vfmsq_lane_f64(a, b, v, lane) simde_vsubq_f64(a, simde_vmulq_lane_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsq_lane_f64 + #define vfmsq_lane_f64(a, b, v, lane) simde_vfmsq_lane_f64(a, b, v, lane) +#endif + +/* simde_vfmsq_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmsq_lane_f16(a, b, v, lane) vfmsq_lane_f16((a), (b), (v), (lane)) +#else + #define simde_vfmsq_lane_f16(a, b, v, lane) simde_vsubq_f16(a, simde_vmulq_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsq_lane_f16 + #define vfmsq_lane_f16(a, b, v, lane) simde_vfmsq_lane_f16(a, b, v, lane) +#endif + +/* simde_vfmsq_lane_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_lane_f32(a, b, v, lane) vfmsq_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vfmsq_lane_f32(a, b, v, lane) simde_vsubq_f32(a, simde_vmulq_lane_f32(b, v, lane)) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsq_lane_f32 + #define vfmsq_lane_f32(a, b, v, lane) simde_vfmsq_lane_f32(a, b, v, lane) +#endif + +/* simde_vfmsq_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmsq_laneq_f16(a, b, v, lane) vfmsq_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfmsq_laneq_f16(a, b, v, lane) \ + simde_vsubq_f16(a, simde_vmulq_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsq_laneq_f16 + #define vfmsq_laneq_f16(a, b, v, lane) simde_vfmsq_laneq_f16(a, b, v, lane) +#endif + +/* simde_vfmsq_laneq_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_laneq_f32(a, b, v, lane) vfmsq_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vfmsq_laneq_f32(a, b, v, lane) \ + simde_vsubq_f32(a, simde_vmulq_laneq_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsq_laneq_f32 + #define vfmsq_laneq_f32(a, b, v, lane) simde_vfmsq_laneq_f32(a, b, v, lane) +#endif + +/* simde_vfmsq_laneq_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_laneq_f64(a, b, v, lane) vfmsq_laneq_f64((a), (b), (v), (lane)) +#else + #define simde_vfmsq_laneq_f64(a, b, v, lane) \ + simde_vsubq_f64(a, simde_vmulq_laneq_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) + #undef vfmsq_laneq_f64 + #define vfmsq_laneq_f64(a, b, v, lane) simde_vfmsq_laneq_f64(a, b, v, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMS_LANE_H) */ diff --git a/arm/neon/fms_n.h b/arm/neon/fms_n.h new file mode 100644 index 000000000..bf2663d20 --- /dev/null +++ b/arm/neon/fms_n.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +* Copyright: +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +* 2023 Ju-Hung Li (Copyright owned by NTHU pllab) +*/ + +#if !defined(SIMDE_ARM_NEON_FMS_N_H) +#define SIMDE_ARM_NEON_FMS_N_H + +#include "types.h" +#include "dup_n.h" +#include "fms.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfms_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfms_n_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f16m1(a_.sv64 , c , b_.sv64 , 4); + return simde_float16x4_from_private(r_); + #else + return simde_vfms_f16(a, b, simde_vdup_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfms_n_f16 + #define vfms_n_f16(a, b, c) simde_vfms_n_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmsq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfmsq_n_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f16m1(a_.sv128 , c , b_.sv128 , 8); + return simde_float16x8_from_private(r_); + #else + return simde_vfmsq_f16(a, b, simde_vdupq_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vfmsq_n_f16 + #define vfmsq_n_f16(a, b, c) simde_vfmsq_n_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfms_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + return vfms_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_float32x2_from_private(r_); + #else + return simde_vfms_f32(a, b, simde_vdup_n_f32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) + #undef vfms_n_f32 + #define vfms_n_f32(a, b, c) simde_vfms_n_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t 
+simde_vfms_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vfms_n_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f64m1(a_.sv64 , c , b_.sv64 , 1); + return simde_float64x1_from_private(r_); + #else + return simde_vfms_f64(a, b, simde_vdup_n_f64(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vfms_n_f64 + #define vfms_n_f64(a, b, c) simde_vfms_n_f64(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + return vfmsq_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_float32x4_from_private(r_); + #else + return simde_vfmsq_f32(a, b, simde_vdupq_n_f32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) + #undef vfmsq_n_f32 + #define vfmsq_n_f32(a, b, c) simde_vfmsq_n_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vfmsq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vfmsq_n_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f64m1(a_.sv128 , c , b_.sv128 , 2); + return simde_float64x2_from_private(r_); + #else + return simde_vfmsq_f64(a, b, simde_vdupq_n_f64(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vfmsq_n_f64 + #define vfmsq_n_f64(a, b, c) simde_vfmsq_n_f64(a, b, c) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMS_N_H) */ diff --git a/arm/neon/get_high.h b/arm/neon/get_high.h index 654c63bd6..ff8f537ef 100644 --- a/arm/neon/get_high.h +++ b/arm/neon/get_high.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_GET_HIGH_H) @@ -34,6 +36,31 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vget_high_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return 
vget_high_f16(a); + #else + simde_float16x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vslidedown_vx_f16m1(a_.sv128 , 4 , 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + #endif + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vget_high_f16 + #define vget_high_f16(a) simde_vget_high_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vget_high_f32(simde_float32x4_t a) { @@ -43,7 +70,9 @@ simde_vget_high_f32(simde_float32x4_t a) { simde_float32x2_private r_; simde_float32x4_private a_ = simde_float32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_f32m1(a_.sv128 , 2 , 4); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 2, 3); #else SIMDE_VECTORIZE @@ -69,7 +98,9 @@ simde_vget_high_f64(simde_float64x2_t a) { simde_float64x1_private r_; simde_float64x2_private a_ = simde_float64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_f64m1(a_.sv128 , 1 , 2); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 1); #else SIMDE_VECTORIZE @@ -95,7 +126,9 @@ simde_vget_high_s8(simde_int8x16_t a) { simde_int8x8_private r_; simde_int8x16_private a_ = simde_int8x16_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 8, 9, 10, 11, 12, 13, 14, 15); #else SIMDE_VECTORIZE @@ -121,7 +154,9 @@ simde_vget_high_s16(simde_int16x8_t a) { simde_int16x4_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 4, 5, 6, 7); #else SIMDE_VECTORIZE @@ -147,7 +182,9 @@ simde_vget_high_s32(simde_int32x4_t a) { simde_int32x2_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2 , 4); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 2, 3); #else SIMDE_VECTORIZE @@ -173,7 +210,9 @@ simde_vget_high_s64(simde_int64x2_t a) { simde_int64x1_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i64m1(a_.sv128 , 1 , 2); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 1); #else SIMDE_VECTORIZE @@ -199,7 +238,9 @@ simde_vget_high_u8(simde_uint8x16_t a) { simde_uint8x8_private r_; simde_uint8x16_private a_ = simde_uint8x16_to_private(a); - #if 
HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 8, 9, 10, 11, 12, 13, 14,15); #else SIMDE_VECTORIZE @@ -225,7 +266,9 @@ simde_vget_high_u16(simde_uint16x8_t a) { simde_uint16x4_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 4, 5, 6, 7); #else SIMDE_VECTORIZE @@ -251,7 +294,9 @@ simde_vget_high_u32(simde_uint32x4_t a) { simde_uint32x2_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2 , 4); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 2, 3); #else SIMDE_VECTORIZE @@ -277,7 +322,9 @@ simde_vget_high_u64(simde_uint64x2_t a) { simde_uint64x1_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u64m1(a_.sv128 , 1 , 2); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 1); #else SIMDE_VECTORIZE @@ -294,6 +341,94 @@ simde_vget_high_u64(simde_uint64x2_t a) { #define vget_high_u64(a) simde_vget_high_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vget_high_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_high_p8(a); + #else + simde_poly8x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_high_p8 + #define vget_high_p8(a) simde_vget_high_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vget_high_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_high_p16(a); + #else + simde_poly16x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_high_p16 + #define vget_high_p16(a) simde_vget_high_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vget_high_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vget_high_p64(a); + #else + simde_poly64x1_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_high_p64 + #define 
vget_high_p64(a) simde_vget_high_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vget_high_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vget_high_bf16(a); + #else + simde_bfloat16x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_high_bf16 + #define vget_high_bf16(a) simde_vget_high_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/get_lane.h b/arm/neon/get_lane.h index 2dbeb55c6..d19dd9847 100644 --- a/arm/neon/get_lane.h +++ b/arm/neon/get_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_GET_LANE_H) @@ -34,6 +35,28 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vget_lane_f16(simde_float16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16_t r; + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_(vget_lane_f16, r, (HEDLEY_UNREACHABLE(), SIMDE_FLOAT16_VALUE(0.0)), lane, v); + #else + simde_float16x4_private v_ = simde_float16x4_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vget_lane_f16 + #define vget_lane_f16(v, lane) simde_vget_lane_f16((v), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vget_lane_f32(simde_float32x2_t v, const int lane) @@ -247,6 +270,28 @@ simde_vget_lane_u64(simde_uint64x1_t v, const int lane) #define vget_lane_u64(v, lane) simde_vget_lane_u64((v), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vgetq_lane_f16(simde_float16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16_t r; + + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_(vgetq_lane_f16, r, (HEDLEY_UNREACHABLE(), SIMDE_FLOAT16_VALUE(0.0)), lane, v); + #else + simde_float16x8_private v_ = simde_float16x8_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vgetq_lane_f16 + #define vgetq_lane_f16(v, lane) simde_vgetq_lane_f16((v), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vgetq_lane_f32(simde_float32x4_t v, const int lane) @@ -513,6 +558,169 @@ simde_vgetq_lane_u64(simde_uint64x2_t v, const int lane) #define vgetq_lane_u64(v, lane) simde_vgetq_lane_u64((v), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vget_lane_p8(simde_poly8x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8_t r; + simde_poly8x8_private v_ = simde_poly8x8_to_private(v); + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vget_lane_p8(v, lane) vget_lane_p8((v), (lane)) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vget_lane_p8 + #define vget_lane_p8(v, lane) simde_vget_lane_p8((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vget_lane_p16(simde_poly16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16_t r; + simde_poly16x4_private v_ = simde_poly16x4_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vget_lane_p16(v, lane) vget_lane_p16((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vget_lane_p16 + #define vget_lane_p16(v, lane) simde_vget_lane_p16((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64_t +simde_vget_lane_p64(simde_poly64x1_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64_t r; + simde_poly64x1_private v_ = simde_poly64x1_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vget_lane_p64(v, lane) vget_lane_p64((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vget_lane_p64 + #define vget_lane_p64(v, lane) simde_vget_lane_p64((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vgetq_lane_p8(simde_poly8x16_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8_t r; + simde_poly8x16_private v_ = simde_poly8x16_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vgetq_lane_p8(v, lane) vgetq_lane_p8((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vgetq_lane_p8 + #define vgetq_lane_p8(v, lane) simde_vgetq_lane_p8((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vgetq_lane_p16(simde_poly16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16_t r; + simde_poly16x8_private v_ = simde_poly16x8_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vgetq_lane_p16(v, lane) vgetq_lane_p16((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vgetq_lane_p16 + #define vgetq_lane_p16(v, lane) simde_vgetq_lane_p16((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64_t +simde_vgetq_lane_p64(simde_poly64x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64_t r; + simde_poly64x2_private v_ = simde_poly64x2_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vgetq_lane_p64(v, lane) vgetq_lane_p64((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vgetq_lane_p64 + #define vgetq_lane_p64(v, lane) simde_vgetq_lane_p64((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t 
+simde_vget_lane_bf16(simde_bfloat16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16_t r; + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vget_lane_bf16, r, (HEDLEY_UNREACHABLE(), SIMDE_BFLOAT16_VALUE(0.0)), lane, v); + #else + simde_bfloat16x4_private v_ = simde_bfloat16x4_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vget_lane_bf16 + #define vget_lane_bf16(v, lane) simde_vget_lane_bf16((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vgetq_lane_bf16(simde_bfloat16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16_t r; + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_(vgetq_lane_bf16, r, (HEDLEY_UNREACHABLE(), SIMDE_BFLOAT16_VALUE(0.0)), lane, v); + #else + simde_bfloat16x8_private v_ = simde_bfloat16x8_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vgetq_lane_bf16 + #define vgetq_lane_bf16(v, lane) simde_vgetq_lane_bf16((v), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/get_low.h b/arm/neon/get_low.h index 84e17783c..36fae5890 100644 --- a/arm/neon/get_low.h +++ b/arm/neon/get_low.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_GET_LOW_H) @@ -34,6 +36,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vget_low_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vget_low_f16(a); + #else + simde_float16x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = a_.sv128; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vget_low_f16 + #define vget_low_f16(a) simde_vget_low_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vget_low_f32(simde_float32x4_t a) { @@ -43,7 +72,9 @@ simde_vget_low_f32(simde_float32x4_t a) { simde_float32x2_private r_; simde_float32x4_private a_ = simde_float32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1); #else SIMDE_VECTORIZE @@ -69,7 +100,9 @@ simde_vget_low_f64(simde_float64x2_t a) { simde_float64x1_private r_; simde_float64x2_private a_ = simde_float64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = 
__builtin_shufflevector(a_.values, a_.values, 0); #else SIMDE_VECTORIZE @@ -97,6 +130,8 @@ simde_vget_low_s8(simde_int8x16_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3, 4, 5, 6, 7); @@ -127,6 +162,8 @@ simde_vget_low_s16(simde_int16x8_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3); @@ -157,6 +194,8 @@ simde_vget_low_s32(simde_int32x4_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1); @@ -187,6 +226,8 @@ simde_vget_low_s64(simde_int64x2_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0); @@ -217,6 +258,8 @@ simde_vget_low_u8(simde_uint8x16_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3, 4, 5, 6, 7); @@ -247,6 +290,8 @@ simde_vget_low_u16(simde_uint16x8_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3); @@ -277,6 +322,8 @@ simde_vget_low_u32(simde_uint32x4_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1); @@ -307,6 +354,8 @@ simde_vget_low_u64(simde_uint64x2_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0); @@ -326,6 +375,95 @@ simde_vget_low_u64(simde_uint64x2_t a) { #define vget_low_u64(a) simde_vget_low_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vget_low_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_low_p8(a); + #else + simde_poly8x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_low_p8 + #define vget_low_p8(a) simde_vget_low_p8((a)) 
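
(Editorial aside, illustrative only and not part of the vendored diff: the vget_low_* additions above, including the RISC-V branch that simply copies sv128 into sv64, all amount to keeping the first half of the input lanes. A minimal usage sketch in C, assuming SIMDe is installed so that <simde/arm/neon.h> is reachable; the include path may differ per installation.)

/* Illustrative sketch only; not part of the upstream patch. */
#include <simde/arm/neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t data[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  uint8_t low[8];

  simde_uint8x16_t v  = simde_vld1q_u8(data);   /* load all 16 lanes     */
  simde_uint8x8_t  lo = simde_vget_low_u8(v);   /* keep lanes 0..7 only  */
  simde_vst1_u8(low, lo);                       /* store the low half    */

  for (int i = 0 ; i < 8 ; i++) {
    printf("%u ", (unsigned) low[i]);           /* prints: 0 1 2 3 4 5 6 7 */
  }
  printf("\n");
  return 0;
}
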
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vget_low_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_low_p16(a); + #else + simde_poly16x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_low_p16 + #define vget_low_p16(a) simde_vget_low_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vget_low_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vget_low_p64(a); + #else + simde_poly64x1_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_low_p64 + #define vget_low_p64(a) simde_vget_low_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vget_low_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vget_low_bf16(a); + #else + simde_bfloat16x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vget_low_bf16 + #define vget_low_bf16(a) simde_vget_low_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/hadd.h b/arm/neon/hadd.h index 53e26d716..7e72ba3f7 100644 --- a/arm/neon/hadd.h +++ b/arm/neon/hadd.h @@ -46,6 +46,14 @@ simde_int8x8_t simde_vhadd_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b); + + r_.sv64 = __riscv_vaadd_vv_i8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8); + return simde_int8x8_from_private(r_); #else return simde_vmovn_s16(simde_vshrq_n_s16(simde_vaddl_s8(a, b), 1)); #endif @@ -60,6 +68,14 @@ simde_int16x4_t simde_vhadd_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + r_.sv64 = __riscv_vaadd_vv_i16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4); + return simde_int16x4_from_private(r_); #else return simde_vmovn_s32(simde_vshrq_n_s32(simde_vaddl_s16(a, b), 1)); #endif @@ -74,6 +90,14 @@ simde_int32x2_t simde_vhadd_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + r_.sv64 = __riscv_vaadd_vv_i32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2); + return simde_int32x2_from_private(r_); #else return simde_vmovn_s64(simde_vshrq_n_s64(simde_vaddl_s32(a, b), 1)); #endif @@ -88,6 +112,14 @@ 
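
(Editorial aside, illustrative only and not part of the vendored diff: the RISC-V branches above map NEON's halving add onto __riscv_vaadd with the round-down mode __RISCV_VXRM_RDN, i.e. each lane becomes floor((a + b) / 2) with the sum formed in a wider intermediate so it cannot overflow. A small, hedged scalar cross-check of that semantic, again assuming <simde/arm/neon.h> is available.)

/* Illustrative sketch only; not part of the upstream patch. */
#include <simde/arm/neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t a[8] = { 250, 1, 128, 0, 255, 200, 3, 16 };
  uint8_t b[8] = { 250, 2, 129, 1, 255, 100, 4, 17 };
  uint8_t r[8];

  /* vhadd_u8: per-lane (a + b) >> 1, with the addition done in 16 bits. */
  simde_vst1_u8(r, simde_vhadd_u8(simde_vld1_u8(a), simde_vld1_u8(b)));

  for (int i = 0 ; i < 8 ; i++) {
    uint8_t expected = (uint8_t) ((((uint16_t) a[i]) + ((uint16_t) b[i])) >> 1);
    printf("lane %d: got %u, expected %u\n", i, (unsigned) r[i], (unsigned) expected);
  }
  return 0;
}
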
simde_uint8x8_t simde_vhadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b); + + r_.sv64 = __riscv_vaaddu_vv_u8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8); + return simde_uint8x8_from_private(r_); #else return simde_vmovn_u16(simde_vshrq_n_u16(simde_vaddl_u8(a, b), 1)); #endif @@ -102,6 +134,14 @@ simde_uint16x4_t simde_vhadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b); + + r_.sv64 = __riscv_vaaddu_vv_u16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4); + return simde_uint16x4_from_private(r_); #else return simde_vmovn_u32(simde_vshrq_n_u32(simde_vaddl_u16(a, b), 1)); #endif @@ -116,6 +156,14 @@ simde_uint32x2_t simde_vhadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b); + + r_.sv64 = __riscv_vaaddu_vv_u32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2); + return simde_uint32x2_from_private(r_); #else return simde_vmovn_u64(simde_vshrq_n_u64(simde_vaddl_u32(a, b), 1)); #endif @@ -138,6 +186,8 @@ simde_vhaddq_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_add_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaadd_vv_i8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -166,6 +216,8 @@ simde_vhaddq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_add_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaadd_vv_i16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -194,6 +246,8 @@ simde_vhaddq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_add_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaadd_vv_i32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -233,6 +287,8 @@ simde_vhaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { 1); r_.v128 = wasm_i8x16_shuffle(lo, hi, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaaddu_vv_u8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -261,6 +317,8 @@ simde_vhaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_add_epi32(_mm256_cvtepu16_epi32(a_.m128i), 
_mm256_cvtepu16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaaddu_vv_u16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -289,6 +347,8 @@ simde_vhaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_add_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaaddu_vv_u32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/hsub.h b/arm/neon/hsub.h index d8e7e02fb..17c563b95 100644 --- a/arm/neon/hsub.h +++ b/arm/neon/hsub.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* TODO: the 128-bit versions only require AVX-512 because of the final @@ -46,6 +47,14 @@ simde_int8x8_t simde_vhsub_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b); + + r_.sv64 = __riscv_vasub_vv_i8m1(a_.sv64, b_.sv64, 2, 8); + return simde_int8x8_from_private(r_); #else return simde_vmovn_s16(simde_vshrq_n_s16(simde_vsubl_s8(a, b), 1)); #endif @@ -60,6 +69,14 @@ simde_int16x4_t simde_vhsub_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + r_.sv64 = __riscv_vasub_vv_i16m1(a_.sv64, b_.sv64, 2, 4); + return simde_int16x4_from_private(r_); #else return simde_vmovn_s32(simde_vshrq_n_s32(simde_vsubl_s16(a, b), 1)); #endif @@ -74,6 +91,14 @@ simde_int32x2_t simde_vhsub_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + r_.sv64 = __riscv_vasub_vv_i32m1(a_.sv64, b_.sv64, 2, 2); + return simde_int32x2_from_private(r_); #else return simde_vmovn_s64(simde_vshrq_n_s64(simde_vsubl_s32(a, b), 1)); #endif @@ -88,6 +113,14 @@ simde_uint8x8_t simde_vhsub_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b); + + r_.sv64 = __riscv_vasubu_vv_u8m1(a_.sv64, b_.sv64, 2, 8); + return simde_uint8x8_from_private(r_); #else return simde_vmovn_u16(simde_vshrq_n_u16(simde_vsubl_u8(a, b), 1)); #endif @@ -102,6 +135,14 @@ simde_uint16x4_t simde_vhsub_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b); + + r_.sv64 = __riscv_vasubu_vv_u16m1(a_.sv64, b_.sv64, 2, 4); + return simde_uint16x4_from_private(r_); #else return simde_vmovn_u32(simde_vshrq_n_u32(simde_vsubl_u16(a, b), 1)); #endif @@ -116,6 +157,14 @@ simde_uint32x2_t simde_vhsub_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b); + + r_.sv64 = __riscv_vasubu_vv_u32m1(a_.sv64, b_.sv64, 2, 2); + return simde_uint32x2_from_private(r_); #else return simde_vmovn_u64(simde_vshrq_n_u64(simde_vsubl_u32(a, b), 1)); #endif @@ -138,6 +187,8 @@ simde_vhsubq_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_sub_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasub_vv_i8m1(a_.sv128, b_.sv128, 2, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -166,6 +217,8 @@ simde_vhsubq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_sub_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasub_vv_i16m1(a_.sv128, b_.sv128, 2, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -194,6 +247,8 @@ simde_vhsubq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_sub_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasub_vv_i32m1(a_.sv128, b_.sv128, 2, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -222,6 +277,8 @@ simde_vhsubq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) r_.m128i = _mm256_cvtepi16_epi8(_mm256_srli_epi16(_mm256_sub_epi16(_mm256_cvtepu8_epi16(a_.m128i), _mm256_cvtepu8_epi16(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasubu_vv_u8m1(a_.sv128, b_.sv128, 2, 16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) v128_t lo = wasm_u16x8_shr(wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(a_.v128), @@ -261,6 +318,8 @@ simde_vhsubq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_sub_epi32(_mm256_cvtepu16_epi32(a_.m128i), _mm256_cvtepu16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasubu_vv_u16m1(a_.sv128, b_.sv128, 2, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -289,6 +348,8 @@ simde_vhsubq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_sub_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasubu_vv_u32m1(a_.sv128, b_.sv128, 2, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/ld1.h b/arm/neon/ld1.h index 2fa8d1f56..29f08fe85 100644 --- a/arm/neon/ld1.h +++ b/arm/neon/ld1.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU 
pllab) */ #if !defined(SIMDE_ARM_NEON_LD1_H) @@ -36,16 +38,20 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t -simde_vld1_f16(simde_float16 const ptr[HEDLEY_ARRAY_PARAM(4)]) { +simde_vld1_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vld1_f16(ptr); #else simde_float16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1_f16 #define vld1_f16(a) simde_vld1_f16((a)) #endif @@ -57,7 +63,11 @@ simde_vld1_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_f32(ptr); #else simde_float32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_f32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float32x2_from_private(r_); #endif } @@ -73,7 +83,11 @@ simde_vld1_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_f64(ptr); #else simde_float64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_f64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float64x1_from_private(r_); #endif } @@ -89,7 +103,11 @@ simde_vld1_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_s8(ptr); #else simde_int8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_i8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int8x8_from_private(r_); #endif } @@ -105,7 +123,11 @@ simde_vld1_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_s16(ptr); #else simde_int16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_i16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int16x4_from_private(r_); #endif } @@ -121,7 +143,11 @@ simde_vld1_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_s32(ptr); #else simde_int32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_i32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int32x2_from_private(r_); #endif } @@ -137,7 +163,11 @@ simde_vld1_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_s64(ptr); #else simde_int64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_i64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int64x1_from_private(r_); #endif } @@ -153,7 +183,11 @@ simde_vld1_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_u8(ptr); #else simde_uint8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint8x8_from_private(r_); #endif } @@ -169,7 +203,11 @@ simde_vld1_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_u16(ptr); #else simde_uint16x4_private r_; - 
simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint16x4_from_private(r_); #endif } @@ -185,7 +223,11 @@ simde_vld1_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_u32(ptr); #else simde_uint32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_u32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint32x2_from_private(r_); #endif } @@ -201,7 +243,11 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_u64(ptr); #else simde_uint64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint64x1_from_private(r_); #endif } @@ -212,20 +258,22 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t -simde_vld1q_f16(simde_float16 const ptr[HEDLEY_ARRAY_PARAM(8)]) { +simde_vld1q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vld1q_f16(ptr); #else simde_float16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1q_f16 #define vld1q_f16(a) simde_vld1q_f16((a)) #endif @@ -239,6 +287,8 @@ simde_vld1q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_float32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_f32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -259,6 +309,8 @@ simde_vld1q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_float64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_f64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -279,6 +331,8 @@ simde_vld1q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_int8x16_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_i8m1(ptr , 16); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -299,6 +353,8 @@ simde_vld1q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_int16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_i16m1(ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -319,6 +375,8 @@ simde_vld1q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_int32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_i32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -339,6 +397,8 @@ simde_vld1q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_int64x2_private r_; #if 
defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_i64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -359,6 +419,8 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_uint8x16_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -370,82 +432,6 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #define vld1q_u8(a) simde_vld1q_u8((a)) #endif -#if !defined(SIMDE_BUG_INTEL_857088) - -SIMDE_FUNCTION_ATTRIBUTES -simde_uint8x16x2_t -simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { - #if \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - return vld1q_u8_x2(ptr); - #else - simde_uint8x16_private a_[2]; - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]) } }; - return s_; - #endif -} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u8_x2 - #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_uint8x16x3_t -simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { - #if \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - return vld1q_u8_x3(ptr); - #else - simde_uint8x16_private a_[3]; - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]), - simde_uint8x16_from_private(a_[2]) } }; - return s_; - #endif -} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u8_x3 - #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_uint8x16x4_t -simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { - #if \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - return vld1q_u8_x4(ptr); - #else - simde_uint8x16_private a_[4]; - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]), - simde_uint8x16_from_private(a_[2]), - simde_uint8x16_from_private(a_[3]) } }; - return s_; - #endif -} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u8_x4 - #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a)) -#endif - -#endif /* !defined(SIMDE_BUG_INTEL_857088) */ - SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { @@ -455,6 +441,8 @@ simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_uint16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -475,6 +463,8 @@ 
simde_vld1q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_uint32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_u32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -495,6 +485,8 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_uint64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -506,6 +498,177 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #define vld1q_u64(a) simde_vld1q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vld1_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_p8(ptr); + #else + simde_poly8x8_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p8 + #define vld1_p8(a) simde_vld1_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vld1_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_p16(ptr); + #else + simde_poly16x4_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p16 + #define vld1_p16(a) simde_vld1_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vld1_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1_p64(ptr); + #else + simde_poly64x1_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_p64 + #define vld1_p64(a) simde_vld1_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vld1q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1q_p8(ptr); + #else + simde_poly8x16_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p8 + #define vld1q_p8(a) simde_vld1q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vld1q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_p16(ptr); + #else + simde_poly16x8_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p16 + #define vld1q_p16(a) simde_vld1q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vld1q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return 
vld1q_p64(ptr); + #else + simde_poly64x2_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_p64 + #define vld1q_p64(a) simde_vld1q_p64((a)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vldrq_p128(simde_poly128_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vldrq_p128(ptr); + #else + simde_poly128_t r_; + simde_memcpy(&r_, ptr, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_CRYPTO)) + #undef vldrq_p128 + #define vldrq_p128(a) simde_vldrq_p128((a)) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vld1_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16(ptr); + #else + simde_bfloat16x4_private r_; + simde_memcpy(&r_, ptr, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_BF16)) + #undef vld1_bf16 + #define vld1_bf16(a) simde_vld1_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vld1q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16(ptr); + #else + simde_bfloat16x8_private r_; + simde_memcpy(&r_, ptr, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_BF16)) + #undef vld1q_bf16 + #define vld1q_bf16(a) simde_vld1q_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/ld1_dup.h b/arm/neon/ld1_dup.h index 9df7477b7..e9cf43239 100644 --- a/arm/neon/ld1_dup.h +++ b/arm/neon/ld1_dup.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_LD1_DUP_H) @@ -35,6 +36,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vld1_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld1_dup_f16(ptr); + #else + return simde_vdup_n_f16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vld1_dup_f16 + #define vld1_dup_f16(a) simde_vld1_dup_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vld1_dup_f32(simde_float32 const * ptr) { @@ -177,6 +192,20 @@ simde_vld1_dup_u64(uint64_t const * ptr) { #define vld1_dup_u64(a) simde_vld1_dup_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vld1q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld1q_dup_f16(ptr); + #else + return simde_vdupq_n_f16(*ptr); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vld1q_dup_f16 + #define vld1q_dup_f16(a) simde_vld1q_dup_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vld1q_dup_f32(simde_float32 const * ptr) { @@ -401,6 +430,118 @@ simde_vld1q_dup_u64(uint64_t const * ptr) { #define vld1q_dup_u64(a) simde_vld1q_dup_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vld1_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_dup_p8(ptr); + #else + return simde_vdup_n_p8(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_p8 + #define vld1_dup_p8(a) simde_vld1_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vld1_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_dup_p16(ptr); + #else + return simde_vdup_n_p16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_p16 + #define vld1_dup_p16(a) simde_vld1_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vld1_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1_dup_p64(ptr); + #else + return simde_vdup_n_p64(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_p64 + #define vld1_dup_p64(a) simde_vld1_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vld1q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_dup_p8(ptr); + #else + return simde_vdupq_n_p8(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_p8 + #define vld1q_dup_p8(a) simde_vld1q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vld1q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_dup_p16(ptr); + #else + return simde_vdupq_n_p16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_p16 + #define vld1q_dup_p16(a) simde_vld1q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vld1q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1q_dup_p64(ptr); + #else + return simde_vdupq_n_p64(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_p64 + #define vld1q_dup_p64(a) simde_vld1q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vld1_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_dup_bf16(ptr); + #else + return simde_vdup_n_bf16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_bf16 + #define vld1_dup_bf16(a) simde_vld1_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vld1q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_dup_bf16(ptr); + #else + return simde_vdupq_n_bf16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_bf16 + #define vld1q_dup_bf16(a) simde_vld1q_dup_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/ld1_lane.h b/arm/neon/ld1_lane.h index 4e36caf52..961a67209 100644 --- a/arm/neon/ld1_lane.h +++ 
b/arm/neon/ld1_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_LD1_LANE_H) @@ -161,6 +162,22 @@ simde_uint64x1_t simde_vld1_lane_u64(uint64_t const *ptr, simde_uint64x1_t src, #define vld1_lane_u64(ptr, src, lane) simde_vld1_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vld1_lane_f16(simde_float16_t const *ptr, simde_float16x4_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private r = simde_float16x4_to_private(src); + r.values[lane] = *ptr; + return simde_float16x4_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld1_lane_f16(ptr, src, lane) vld1_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vld1_lane_f16 + #define vld1_lane_f16(ptr, src, lane) simde_vld1_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vld1_lane_f32(simde_float32_t const *ptr, simde_float32x2_t src, const int lane) @@ -321,6 +338,22 @@ simde_uint64x2_t simde_vld1q_lane_u64(uint64_t const *ptr, simde_uint64x2_t src, #define vld1q_lane_u64(ptr, src, lane) simde_vld1q_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vld1q_lane_f16(simde_float16_t const *ptr, simde_float16x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_private r = simde_float16x8_to_private(src); + r.values[lane] = *ptr; + return simde_float16x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld1q_lane_f16(ptr, src, lane) vld1q_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vld1q_lane_f16 + #define vld1q_lane_f16(ptr, src, lane) simde_vld1q_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vld1q_lane_f32(simde_float32_t const *ptr, simde_float32x4_t src, const int lane) @@ -353,6 +386,141 @@ simde_float64x2_t simde_vld1q_lane_f64(simde_float64_t const *ptr, simde_float64 #define vld1q_lane_f64(ptr, src, lane) simde_vld1q_lane_f64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vld1_lane_p8(simde_poly8_t const *ptr, simde_poly8x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8_private r = simde_poly8x8_to_private(src); + r.values[lane] = *ptr; + return simde_poly8x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1_lane_p8(ptr, src, lane) vld1_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_p8 + #define vld1_lane_p8(ptr, src, lane) simde_vld1_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vld1_lane_p16(simde_poly16_t const *ptr, simde_poly16x4_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4_private r = simde_poly16x4_to_private(src); + r.values[lane] = *ptr; + return simde_poly16x4_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1_lane_p16(ptr, src, lane) vld1_lane_p16(ptr, src, lane) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_p16 + #define vld1_lane_p16(ptr, src, lane) simde_vld1_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vld1_lane_p64(simde_poly64_t const *ptr, simde_poly64x1_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1_private r = simde_poly64x1_to_private(src); + r.values[lane] = *ptr; + return simde_poly64x1_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vld1_lane_p64(ptr, src, lane) vld1_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_p64 + #define vld1_lane_p64(ptr, src, lane) simde_vld1_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vld1q_lane_p8(simde_poly8_t const *ptr, simde_poly8x16_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16_private r = simde_poly8x16_to_private(src); + r.values[lane] = *ptr; + return simde_poly8x16_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1q_lane_p8(ptr, src, lane) vld1q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_p8 + #define vld1q_lane_p8(ptr, src, lane) simde_vld1q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vld1q_lane_p16(simde_poly16_t const *ptr, simde_poly16x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8_private r = simde_poly16x8_to_private(src); + r.values[lane] = *ptr; + return simde_poly16x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1q_lane_p16(ptr, src, lane) vld1q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_p16 + #define vld1q_lane_p16(ptr, src, lane) simde_vld1q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vld1q_lane_p64(simde_poly64_t const *ptr, simde_poly64x2_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2_private r = simde_poly64x2_to_private(src); + r.values[lane] = *ptr; + return simde_poly64x2_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vld1q_lane_p64(ptr, src, lane) vld1q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_p64 + #define vld1q_lane_p64(ptr, src, lane) simde_vld1q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t simde_vld1_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloat16x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4_private r = simde_bfloat16x4_to_private(src); + r.values[lane] = *ptr; + return simde_bfloat16x4_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld1_lane_bf16(ptr, src, lane) vld1_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1_lane_bf16 + #define vld1_lane_bf16(ptr, src, lane) simde_vld1_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t simde_vld1q_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloat16x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8_private r = 
simde_bfloat16x8_to_private(src); + r.values[lane] = *ptr; + return simde_bfloat16x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld1q_lane_bf16(ptr, src, lane) vld1q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1q_lane_bf16 + #define vld1q_lane_bf16(ptr, src, lane) simde_vld1q_lane_bf16((ptr), (src), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/ld1_x2.h b/arm/neon/ld1_x2.h new file mode 100644 index 000000000..04af43955 --- /dev/null +++ b/arm/neon/ld1_x2.h @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1_X2_H) +#define SIMDE_ARM_NEON_LD1_X2_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vld1_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f16_x2(ptr); + #else + simde_float16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float16x4x2_t s_ = { { simde_float16x4_from_private(a_[0]), + simde_float16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_f16_x2 + #define vld1_f16_x2(a) simde_vld1_f16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x2_t +simde_vld1_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f32_x2(ptr); + #else + simde_float32x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float32x2x2_t s_ = { { simde_float32x2_from_private(a_[0]), + simde_float32x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_f32_x2 + #define vld1_f32_x2(a) simde_vld1_f32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x2_t +simde_vld1_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1_f64_x2(ptr); + #else + simde_float64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) 
+ a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_float64x1x2_t s_ = { { simde_float64x1_from_private(a_[0]), + simde_float64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vld1_f64_x2 + #define vld1_f64_x2(a) simde_vld1_f64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x2_t +simde_vld1_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s8_x2(ptr); + #else + simde_int8x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int8x8x2_t s_ = { { simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s8_x2 + #define vld1_s8_x2(a) simde_vld1_s8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x2_t +simde_vld1_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s16_x2(ptr); + #else + simde_int16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int16x4x2_t s_ = { { simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s16_x2 + #define vld1_s16_x2(a) simde_vld1_s16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x2_t +simde_vld1_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s32_x2(ptr); + #else + simde_int32x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + 
a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int32x2x2_t s_ = { { simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s32_x2 + #define vld1_s32_x2(a) simde_vld1_s32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x2_t +simde_vld1_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s64_x2(ptr); + #else + simde_int64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_int64x1x2_t s_ = { { simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s64_x2 + #define vld1_s64_x2(a) simde_vld1_s64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x2_t +simde_vld1_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u8_x2(ptr); + #else + simde_uint8x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint8x8x2_t s_ = { { simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u8_x2 + #define vld1_u8_x2(a) simde_vld1_u8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x2_t +simde_vld1_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u16_x2(ptr); 
+ #else + simde_uint16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint16x4x2_t s_ = { { simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u16_x2 + #define vld1_u16_x2(a) simde_vld1_u16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x2_t +simde_vld1_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u32_x2(ptr); + #else + simde_uint32x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint32x2x2_t s_ = { { simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u32_x2 + #define vld1_u32_x2(a) simde_vld1_u32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x2_t +simde_vld1_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u64_x2(ptr); + #else + simde_uint64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_uint64x1x2_t s_ = { { simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u64_x2 + #define vld1_u64_x2(a) simde_vld1_u64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vld1_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld1_p8_x2(ptr); + #else + simde_poly8x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = 
__riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly8x8x2_t s_ = { { simde_poly8x8_from_private(a_[0]), + simde_poly8x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld1_p8_x2 + #define vld1_p8_x2(a) simde_vld1_p8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vld1_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld1_p16_x2(ptr); + #else + simde_poly16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_poly16x4x2_t s_ = { { simde_poly16x4_from_private(a_[0]), + simde_poly16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld1_p16_x2 + #define vld1_p16_x2(a) simde_vld1_p16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t +simde_vld1_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p64_x2(ptr); + #else + simde_poly64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_poly64x1x2_t s_ = { { simde_poly64x1_from_private(a_[0]), + simde_poly64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_p64_x2 + #define vld1_p64_x2(a) simde_vld1_p64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t +simde_vld1_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16_x2(ptr); + #else + simde_bfloat16x4_private a_[2]; + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + simde_bfloat16x4x2_t s_ = { { simde_bfloat16x4_from_private(a_[0]), + simde_bfloat16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1_bf16_x2 + #define vld1_bf16_x2(a) simde_vld1_bf16_x2((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1_X2_H) */ diff --git a/arm/neon/ld1_x3.h b/arm/neon/ld1_x3.h new file mode 100644 index 000000000..ad96b19ca --- /dev/null +++ b/arm/neon/ld1_x3.h @@ -0,0 +1,514 @@ 
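A minimal usage sketch for the two-vector loads defined in ld1_x2.h above: vld1_*_x2 performs one contiguous load and splits it across the two .val[] members that the struct initializers in that file populate. The simde/arm/neon.h include path is an assumption of this sketch (it depends on how SIMDe is vendored), as is building with a plain C compiler; everything else uses only functions and types defined by SIMDe.

#include <stdio.h>
#include "simde/arm/neon.h"   /* assumed include path for a vendored SIMDe */

int main(void) {
  simde_float32 buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };

  /* Contiguous load: buf[0..1] land in r.val[0], buf[2..3] in r.val[1]. */
  simde_float32x2x2_t r = simde_vld1_f32_x2(buf);

  printf("%f %f\n",
         simde_vget_lane_f32(r.val[0], 0),   /* 1.000000 */
         simde_vget_lane_f32(r.val[1], 1));  /* 4.000000 */
  return 0;
}

With SIMDE_ENABLE_NATIVE_ALIASES defined before the include, the same code can be written against the plain vld1_f32_x2 name, which the alias guards above map onto simde_vld1_f32_x2 whenever the native intrinsic is not usable.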
+/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1_X3_H) +#define SIMDE_ARM_NEON_LD1_X3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t +simde_vld1_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f16_x3(ptr); + #else + simde_float16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float16x4x3_t s_ = { { simde_float16x4_from_private(a_[0]), + simde_float16x4_from_private(a_[1]), + simde_float16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_f16_x3 + #define vld1_f16_x3(a) simde_vld1_f16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x3_t +simde_vld1_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f32_x3(ptr); + #else + 
simde_float32x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float32x2x3_t s_ = { { simde_float32x2_from_private(a_[0]), + simde_float32x2_from_private(a_[1]), + simde_float32x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_f32_x3 + #define vld1_f32_x3(a) simde_vld1_f32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x3_t +simde_vld1_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1_f64_x3(ptr); + #else + simde_float64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_float64x1x3_t s_ = { { simde_float64x1_from_private(a_[0]), + simde_float64x1_from_private(a_[1]), + simde_float64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vld1_f64_x3 + #define vld1_f64_x3(a) simde_vld1_f64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x3_t +simde_vld1_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s8_x3(ptr); + #else + simde_int8x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int8x8x3_t s_ = { { simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]), + simde_int8x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s8_x3 + #define vld1_s8_x3(a) simde_vld1_s8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x3_t +simde_vld1_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && 
defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s16_x3(ptr); + #else + simde_int16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int16x4x3_t s_ = { { simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + simde_int16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s16_x3 + #define vld1_s16_x3(a) simde_vld1_s16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x3_t +simde_vld1_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s32_x3(ptr); + #else + simde_int32x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int32x2x3_t s_ = { { simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + simde_int32x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s32_x3 + #define vld1_s32_x3(a) simde_vld1_s32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x3_t +simde_vld1_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s64_x3(ptr); + #else + simde_int64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_int64x1x3_t s_ = { { simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + simde_int64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && 
defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s64_x3 + #define vld1_s64_x3(a) simde_vld1_s64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x3_t +simde_vld1_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u8_x3(ptr); + #else + simde_uint8x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint8x8x3_t s_ = { { simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + simde_uint8x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u8_x3 + #define vld1_u8_x3(a) simde_vld1_u8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x3_t +simde_vld1_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u16_x3(ptr); + #else + simde_uint16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint16x4x3_t s_ = { { simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + simde_uint16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u16_x3 + #define vld1_u16_x3(a) simde_vld1_u16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x3_t +simde_vld1_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u32_x3(ptr); + #else + simde_uint32x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint32x2x3_t s_ = { { simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), 
+ simde_uint32x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u32_x3 + #define vld1_u32_x3(a) simde_vld1_u32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x3_t +simde_vld1_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u64_x3(ptr); + #else + simde_uint64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_uint64x1x3_t s_ = { { simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + simde_uint64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u64_x3 + #define vld1_u64_x3(a) simde_vld1_u64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t +simde_vld1_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p8_x3(ptr); + #else + simde_poly8x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly8x8x3_t s_ = { { simde_poly8x8_from_private(a_[0]), + simde_poly8x8_from_private(a_[1]), + simde_poly8x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vld1_p8_x3 + #define vld1_p8_x3(a) simde_vld1_p8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t +simde_vld1_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p16_x3(ptr); + #else + simde_poly16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_poly16x4x3_t s_ = { { simde_poly16x4_from_private(a_[0]), + 
simde_poly16x4_from_private(a_[1]), + simde_poly16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vld1_p16_x3 + #define vld1_p16_x3(a) simde_vld1_p16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t +simde_vld1_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p64_x3(ptr); + #else + simde_poly64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_poly64x1x3_t s_ = { { simde_poly64x1_from_private(a_[0]), + simde_poly64x1_from_private(a_[1]), + simde_poly64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_p64_x3 + #define vld1_p64_x3(a) simde_vld1_p64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t +simde_vld1_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16_x3(ptr); + #else + simde_bfloat16x4_private a_[3]; + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + simde_bfloat16x4x3_t s_ = { { simde_bfloat16x4_from_private(a_[0]), + simde_bfloat16x4_from_private(a_[1]), + simde_bfloat16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1_bf16_x3 + #define vld1_bf16_x3(a) simde_vld1_bf16_x3((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1_X3_H) */ diff --git a/arm/neon/ld1_x4.h b/arm/neon/ld1_x4.h new file mode 100644 index 000000000..1f70daacb --- /dev/null +++ b/arm/neon/ld1_x4.h @@ -0,0 +1,545 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1_X4_H) +#define SIMDE_ARM_NEON_LD1_X4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld1_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f16_x4(ptr); + #else + simde_float16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4); + a_[3].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+12) , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), + simde_float16x4_from_private(a_[1]), + simde_float16x4_from_private(a_[2]), + simde_float16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_f16_x4 + #define vld1_f16_x4(a) simde_vld1_f16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x4_t +simde_vld1_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f32_x4(ptr); + #else + simde_float32x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_f32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), + simde_float32x2_from_private(a_[1]), + simde_float32x2_from_private(a_[2]), + 
simde_float32x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_f32_x4 + #define vld1_f32_x4(a) simde_vld1_f32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x4_t +simde_vld1_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1_f64_x4(ptr); + #else + simde_float64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_f64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), + simde_float64x1_from_private(a_[1]), + simde_float64x1_from_private(a_[2]), + simde_float64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vld1_f64_x4 + #define vld1_f64_x4(a) simde_vld1_f64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x4_t +simde_vld1_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s8_x4(ptr); + #else + simde_int8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_i8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]), + simde_int8x8_from_private(a_[2]), + simde_int8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s8_x4 + #define vld1_s8_x4(a) simde_vld1_s8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x4_t +simde_vld1_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s16_x4(ptr); + #else + simde_int16x4_private a_[4]; + #if 
defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_i16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + simde_int16x4_from_private(a_[2]), + simde_int16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s16_x4 + #define vld1_s16_x4(a) simde_vld1_s16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x4_t +simde_vld1_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s32_x4(ptr); + #else + simde_int32x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_i32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + simde_int32x2_from_private(a_[2]), + simde_int32x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s32_x4 + #define vld1_s32_x4(a) simde_vld1_s32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x4_t +simde_vld1_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s64_x4(ptr); + #else + simde_int64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_i64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + simde_int64x1_from_private(a_[2]), + simde_int64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || 
(SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_s64_x4 + #define vld1_s64_x4(a) simde_vld1_s64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x4_t +simde_vld1_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u8_x4(ptr); + #else + simde_uint8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + simde_uint8x8_from_private(a_[2]), + simde_uint8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u8_x4 + #define vld1_u8_x4(a) simde_vld1_u8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x4_t +simde_vld1_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u16_x4(ptr); + #else + simde_uint16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + simde_uint16x4_from_private(a_[2]), + simde_uint16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u16_x4 + #define vld1_u16_x4(a) simde_vld1_u16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x4_t +simde_vld1_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u32_x4(ptr); + #else + simde_uint32x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + a_[2].sv64 = 
__riscv_vle32_v_u32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_u32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), + simde_uint32x2_from_private(a_[2]), + simde_uint32x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u32_x4 + #define vld1_u32_x4(a) simde_vld1_u32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x4_t +simde_vld1_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u64_x4(ptr); + #else + simde_uint64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + simde_uint64x1_from_private(a_[2]), + simde_uint64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_u64_x4 + #define vld1_u64_x4(a) simde_vld1_u64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld1_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p8_x4(ptr); + #else + simde_poly8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), + simde_poly8x8_from_private(a_[1]), + simde_poly8x8_from_private(a_[2]), + simde_poly8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_p8_x4 + #define vld1_p8_x4(a) simde_vld1_p8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld1_p16_x4(simde_poly16_t const 
ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p16_x4(ptr); + #else + simde_poly16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), + simde_poly16x4_from_private(a_[1]), + simde_poly16x4_from_private(a_[2]), + simde_poly16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vld1_p16_x4 + #define vld1_p16_x4(a) simde_vld1_p16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld1_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p64_x4(ptr); + #else + simde_poly64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), + simde_poly64x1_from_private(a_[1]), + simde_poly64x1_from_private(a_[2]), + simde_poly64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1_p64_x4 + #define vld1_p64_x4(a) simde_vld1_p64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld1_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16_x4(ptr); + #else + simde_bfloat16x4_private a_[4]; + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + simde_bfloat16x4x4_t s_ = { { simde_bfloat16x4_from_private(a_[0]), + simde_bfloat16x4_from_private(a_[1]), + simde_bfloat16x4_from_private(a_[2]), + simde_bfloat16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1_bf16_x4 + #define vld1_bf16_x4(a) simde_vld1_bf16_x4((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1_X4_H) */ diff --git a/arm/neon/ld1q_x2.h b/arm/neon/ld1q_x2.h new file mode 100644 index 000000000..1663cc1f0 --- /dev/null +++ b/arm/neon/ld1q_x2.h @@ -0,0 +1,486 @@ 
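In the same vein, a minimal sketch of the four-vector loads defined in ld1_x4.h above, under the same assumptions as the earlier sketch (simde/arm/neon.h include path, .val[] member layout): 32 consecutive bytes are loaded and distributed over four 8-lane vectors.

#include <stdint.h>
#include <stdio.h>
#include "simde/arm/neon.h"   /* assumed include path for a vendored SIMDe */

int main(void) {
  uint8_t buf[32];
  for (size_t i = 0; i < sizeof(buf); i++) {
    buf[i] = (uint8_t) i;
  }

  /* r.val[k] receives buf[8*k .. 8*k+7] for k = 0..3. */
  simde_uint8x8x4_t r = simde_vld1_u8_x4(buf);

  printf("%u %u\n",
         (unsigned) simde_vget_lane_u8(r.val[0], 0),   /* 0  */
         (unsigned) simde_vget_lane_u8(r.val[3], 7));  /* 31 */
  return 0;
}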
+/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1Q_X2_H) +#define SIMDE_ARM_NEON_LD1Q_X2_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vld1q_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f16_x2(ptr); + #else + simde_float16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_float16x8x2_t s_ = { { simde_float16x8_from_private(a_[0]), + simde_float16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_f16_x2 + #define vld1q_f16_x2(a) simde_vld1q_f16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x2_t +simde_vld1q_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f32_x2(ptr); + #else + simde_float32x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + 
a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float32x4x2_t s_ = { { simde_float32x4_from_private(a_[0]), + simde_float32x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_f32_x2 + #define vld1q_f32_x2(a) simde_vld1q_f32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x2_t +simde_vld1q_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1q_f64_x2(ptr); + #else + simde_float64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float64x2x2_t s_ = { { simde_float64x2_from_private(a_[0]), + simde_float64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vld1q_f64_x2 + #define vld1q_f64_x2(a) simde_vld1q_f64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x2_t +simde_vld1q_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s8_x2(ptr); + #else + simde_int8x16_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_int8x16x2_t s_ = { { simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s8_x2 + #define vld1q_s8_x2(a) simde_vld1q_s8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x2_t +simde_vld1q_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s16_x2(ptr); + #else + simde_int16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = 
__riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int16x8x2_t s_ = { { simde_int16x8_from_private(a_[0]), + simde_int16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s16_x2 + #define vld1q_s16_x2(a) simde_vld1q_s16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x2_t +simde_vld1q_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s32_x2(ptr); + #else + simde_int32x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int32x4x2_t s_ = { { simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s32_x2 + #define vld1q_s32_x2(a) simde_vld1q_s32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x2_t +simde_vld1q_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s64_x2(ptr); + #else + simde_int64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int64x2x2_t s_ = { { simde_int64x2_from_private(a_[0]), + simde_int64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s64_x2 + #define vld1q_s64_x2(a) simde_vld1q_s64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x2_t +simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + 
return vld1q_u8_x2(ptr); + #else + simde_uint8x16_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]), + simde_uint8x16_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u8_x2 + #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x2_t +simde_vld1q_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u16_x2(ptr); + #else + simde_uint16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint16x8x2_t s_ = { { simde_uint16x8_from_private(a_[0]), + simde_uint16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u16_x2 + #define vld1q_u16_x2(a) simde_vld1q_u16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x2_t +simde_vld1q_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u32_x2(ptr); + #else + simde_uint32x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint32x4x2_t s_ = { { simde_uint32x4_from_private(a_[0]), + simde_uint32x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u32_x2 + #define vld1q_u32_x2(a) simde_vld1q_u32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x2_t +simde_vld1q_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && 
defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u64_x2(ptr); + #else + simde_uint64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint64x2x2_t s_ = { { simde_uint64x2_from_private(a_[0]), + simde_uint64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) +#undef vld1q_u64_x2 + #define vld1q_u64_x2(a) simde_vld1q_u64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vld1q_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p8_x2(ptr); + #else + simde_poly8x16_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_poly8x16x2_t s_ = { { simde_poly8x16_from_private(a_[0]), + simde_poly8x16_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_p8_x2 + #define vld1q_p8_x2(a) simde_vld1q_p8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vld1q_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p16_x2(ptr); + #else + simde_poly16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly16x8x2_t s_ = { { simde_poly16x8_from_private(a_[0]), + simde_poly16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_p16_x2 + #define vld1q_p16_x2(a) simde_vld1q_p16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t +simde_vld1q_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p64_x2(ptr); + #else + simde_poly64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] 
= ptr[i]; + } + #endif + simde_poly64x2x2_t s_ = { { simde_poly64x2_from_private(a_[0]), + simde_poly64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_p64_x2 + #define vld1q_p64_x2(a) simde_vld1q_p64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t +simde_vld1q_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16_x2(ptr); + #else + simde_bfloat16x8_private a_[2]; + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + simde_bfloat16x8x2_t s_ = { { simde_bfloat16x8_from_private(a_[0]), + simde_bfloat16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_bf16_x2 + #define vld1q_bf16_x2(a) simde_vld1q_bf16_x2((a)) +#endif + + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X2_H) */ diff --git a/arm/neon/ld1q_x3.h b/arm/neon/ld1q_x3.h new file mode 100644 index 000000000..9f10cfc3c --- /dev/null +++ b/arm/neon/ld1q_x3.h @@ -0,0 +1,514 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1Q_X3_H) +#define SIMDE_ARM_NEON_LD1Q_X3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t +simde_vld1q_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f16_x3(ptr); + #else + simde_float16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_float16x8x3_t s_ = { { simde_float16x8_from_private(a_[0]), + simde_float16x8_from_private(a_[1]), + simde_float16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_f16_x3 + #define vld1q_f16_x3(a) simde_vld1q_f16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x3_t +simde_vld1q_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f32_x3(ptr); + #else + simde_float32x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float32x4x3_t s_ = { { simde_float32x4_from_private(a_[0]), + simde_float32x4_from_private(a_[1]), + simde_float32x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_f32_x3 + #define vld1q_f32_x3(a) simde_vld1q_f32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x3_t +simde_vld1q_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || 
HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1q_f64_x3(ptr); + #else + simde_float64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float64x2x3_t s_ = { { simde_float64x2_from_private(a_[0]), + simde_float64x2_from_private(a_[1]), + simde_float64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vld1q_f64_x3 + #define vld1q_f64_x3(a) simde_vld1q_f64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x3_t +simde_vld1q_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s8_x3(ptr); + #else + simde_int8x16_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_int8x16x3_t s_ = { { simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + simde_int8x16_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s8_x3 + #define vld1q_s8_x3(a) simde_vld1q_s8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x3_t +simde_vld1q_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s16_x3(ptr); + #else + simde_int16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int16x8x3_t s_ = { { simde_int16x8_from_private(a_[0]), + simde_int16x8_from_private(a_[1]), + simde_int16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s16_x3 + #define vld1q_s16_x3(a) 
simde_vld1q_s16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x3_t +simde_vld1q_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s32_x3(ptr); + #else + simde_int32x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int32x4x3_t s_ = { { simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + simde_int32x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s32_x3 + #define vld1q_s32_x3(a) simde_vld1q_s32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x3_t +simde_vld1q_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s64_x3(ptr); + #else + simde_int64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int64x2x3_t s_ = { { simde_int64x2_from_private(a_[0]), + simde_int64x2_from_private(a_[1]), + simde_int64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s64_x3 + #define vld1q_s64_x3(a) simde_vld1q_s64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x3_t +simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u8_x3(ptr); + #else + simde_uint8x16_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]), + simde_uint8x16_from_private(a_[1]), + simde_uint8x16_from_private(a_[2]) } }; + return s_; + #endif 
+} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u8_x3 + #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x3_t +simde_vld1q_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u16_x3(ptr); + #else + simde_uint16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint16x8x3_t s_ = { { simde_uint16x8_from_private(a_[0]), + simde_uint16x8_from_private(a_[1]), + simde_uint16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u16_x3 + #define vld1q_u16_x3(a) simde_vld1q_u16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x3_t +simde_vld1q_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u32_x3(ptr); + #else + simde_uint32x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint32x4x3_t s_ = { { simde_uint32x4_from_private(a_[0]), + simde_uint32x4_from_private(a_[1]), + simde_uint32x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u32_x3 + #define vld1q_u32_x3(a) simde_vld1q_u32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x3_t +simde_vld1q_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u64_x3(ptr); + #else + simde_uint64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 
= __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint64x2x3_t s_ = { { simde_uint64x2_from_private(a_[0]), + simde_uint64x2_from_private(a_[1]), + simde_uint64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u64_x3 + #define vld1q_u64_x3(a) simde_vld1q_u64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t +simde_vld1q_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p8_x3(ptr); + #else + simde_poly8x16_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_poly8x16x3_t s_ = { { simde_poly8x16_from_private(a_[0]), + simde_poly8x16_from_private(a_[1]), + simde_poly8x16_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_p8_x3 + #define vld1q_p8_x3(a) simde_vld1q_p8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t +simde_vld1q_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p16_x3(ptr); + #else + simde_poly16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly16x8x3_t s_ = { { simde_poly16x8_from_private(a_[0]), + simde_poly16x8_from_private(a_[1]), + simde_poly16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_p16_x3 + #define vld1q_p16_x3(a) simde_vld1q_p16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t +simde_vld1q_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p64_x3(ptr); + #else + simde_poly64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + #else + for (size_t i 
= 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_poly64x2x3_t s_ = { { simde_poly64x2_from_private(a_[0]), + simde_poly64x2_from_private(a_[1]), + simde_poly64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_p64_x3 + #define vld1q_p64_x3(a) simde_vld1q_p64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t +simde_vld1q_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16_x3(ptr); + #else + simde_bfloat16x8_private a_[3]; + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + simde_bfloat16x8x3_t s_ = { { simde_bfloat16x8_from_private(a_[0]), + simde_bfloat16x8_from_private(a_[1]), + simde_bfloat16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1q_bf16_x3 + #define vld1q_bf16_x3(a) simde_vld1q_bf16_x3((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X3_H) */ diff --git a/arm/neon/ld1q_x4.h b/arm/neon/ld1q_x4.h new file mode 100644 index 000000000..0811634fa --- /dev/null +++ b/arm/neon/ld1q_x4.h @@ -0,0 +1,544 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1Q_X4_H) +#define SIMDE_ARM_NEON_LD1Q_X4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld1q_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f16_x4(ptr); + #else + simde_float16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); + a_[3].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+24) , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), + simde_float16x8_from_private(a_[1]), + simde_float16x8_from_private(a_[2]), + simde_float16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_f16_x4 + #define vld1q_f16_x4(a) simde_vld1q_f16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x4_t +simde_vld1q_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f32_x4(ptr); + #else + simde_float32x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_f32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), + simde_float32x4_from_private(a_[1]), + simde_float32x4_from_private(a_[2]), + simde_float32x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_f32_x4 + #define vld1q_f32_x4(a) simde_vld1q_f32_x4((a)) +#endif + 
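+/* Usage sketch for the *_x4 loaders in this file (illustrative only; `data`
+ * is a hypothetical caller-side buffer, not something defined by SIMDe):
+ *
+ *   simde_float32 data[16];                     // filled by the caller
+ *   simde_float32x4x4_t v = simde_vld1q_f32_x4(data);
+ *   // v.val[0] holds data[0..3], v.val[1] holds data[4..7], and so on.
+ *
+ * Every function in this header follows the same pattern: use the native
+ * intrinsic when the toolchain provides a working one, otherwise fall back
+ * to four 128-bit RISC-V vector loads or to the scalar copy loop. */
+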
+SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x4_t +simde_vld1q_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1q_f64_x4(ptr); + #else + simde_float64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_f64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), + simde_float64x2_from_private(a_[1]), + simde_float64x2_from_private(a_[2]), + simde_float64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) + #undef vld1q_f64_x4 + #define vld1q_f64_x4(a) simde_vld1q_f64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x4_t +simde_vld1q_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s8_x4(ptr); + #else + simde_int8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_i8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + simde_int8x16_from_private(a_[2]), + simde_int8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s8_x4 + #define vld1q_s8_x4(a) simde_vld1q_s8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x4_t +simde_vld1q_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s16_x4(ptr); + #else + simde_int16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_i16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), + simde_int16x8_from_private(a_[1]), + 
simde_int16x8_from_private(a_[2]), + simde_int16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s16_x4 + #define vld1q_s16_x4(a) simde_vld1q_s16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x4_t +simde_vld1q_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s32_x4(ptr); + #else + simde_int32x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_i32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + simde_int32x4_from_private(a_[2]), + simde_int32x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s32_x4 + #define vld1q_s32_x4(a) simde_vld1q_s32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x4_t +simde_vld1q_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s64_x4(ptr); + #else + simde_int64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_i64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), + simde_int64x2_from_private(a_[1]), + simde_int64x2_from_private(a_[2]), + simde_int64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_s64_x4 + #define vld1q_s64_x4(a) simde_vld1q_s64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x4_t +simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && 
defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u8_x4(ptr); + #else + simde_uint8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]), + simde_uint8x16_from_private(a_[1]), + simde_uint8x16_from_private(a_[2]), + simde_uint8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u8_x4 + #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x4_t +simde_vld1q_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u16_x4(ptr); + #else + simde_uint16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), + simde_uint16x8_from_private(a_[1]), + simde_uint16x8_from_private(a_[2]), + simde_uint16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u16_x4 + #define vld1q_u16_x4(a) simde_vld1q_u16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x4_t +simde_vld1q_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u32_x4(ptr); + #else + simde_uint32x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_u32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), + simde_uint32x4_from_private(a_[1]), + simde_uint32x4_from_private(a_[2]), + simde_uint32x4_from_private(a_[3]) } }; + return 
s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u32_x4 + #define vld1q_u32_x4(a) simde_vld1q_u32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x4_t +simde_vld1q_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u64_x4(ptr); + #else + simde_uint64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), + simde_uint64x2_from_private(a_[1]), + simde_uint64x2_from_private(a_[2]), + simde_uint64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) + #undef vld1q_u64_x4 + #define vld1q_u64_x4(a) simde_vld1q_u64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld1q_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p8_x4(ptr); + #else + simde_poly8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), + simde_poly8x16_from_private(a_[1]), + simde_poly8x16_from_private(a_[2]), + simde_poly8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vld1q_p8_x4 + #define vld1q_p8_x4(a) simde_vld1q_p8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld1q_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p16_x4(ptr); + #else + simde_poly16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + 
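+      /* simde_poly16 storage is bit-compatible with uint16, which is why the
+       * unsigned RVV load intrinsic is reused here; the load below fills the
+       * last eight lanes, ptr[24..31]. */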
a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), + simde_poly16x8_from_private(a_[1]), + simde_poly16x8_from_private(a_[2]), + simde_poly16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vld1q_p16_x4 + #define vld1q_p16_x4(a) simde_vld1q_p16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld1q_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p64_x4(ptr); + #else + simde_poly64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), + simde_poly64x2_from_private(a_[1]), + simde_poly64x2_from_private(a_[2]), + simde_poly64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vld1q_p64_x4 + #define vld1q_p64_x4(a) simde_vld1q_p64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld1q_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16_x4(ptr); + #else + simde_bfloat16x8_private a_[4]; + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + simde_bfloat16x8x4_t s_ = { { simde_bfloat16x8_from_private(a_[0]), + simde_bfloat16x8_from_private(a_[1]), + simde_bfloat16x8_from_private(a_[2]), + simde_bfloat16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld1q_bf16_x4 + #define vld1q_bf16_x4(a) simde_vld1q_bf16_x4((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X4_H) */ diff --git a/arm/neon/ld2.h b/arm/neon/ld2.h index 70cb39af7..b22c80c05 100644 --- a/arm/neon/ld2.h +++ b/arm/neon/ld2.h @@ -22,6 +22,8 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD2_H) @@ -57,6 +59,16 @@ simde_vld2_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vget_high_s8(q) }; return u; + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int8x8_private a_[2]; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 1); + simde_int8x8x2_t r = { { + simde_int8x8_from_private(a_[0]), 
+ simde_int8x8_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int8x16_private a_ = simde_int8x16_to_private(simde_vld1q_s8(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); @@ -90,6 +102,16 @@ simde_int16x4x2_t simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int16x4_private a_[2]; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 1); + simde_int16x4x2_t r = { { + simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int16x8_private a_ = simde_int16x8_to_private(simde_vld1q_s16(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7); @@ -97,6 +119,10 @@ simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_memcpy(&r, &a_, sizeof(r)); return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int16x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -104,6 +130,9 @@ simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_int16x4x2_t r = { { simde_int16x4_from_private(r_[0]), @@ -123,6 +152,16 @@ simde_int32x2x2_t simde_vld2_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int32x2_private a_[2]; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 1); + simde_int32x2x2_t r = { { + simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int32x4_private a_ = simde_int32x4_to_private(simde_vld1q_s32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -156,6 +195,16 @@ simde_int64x1x2_t simde_vld2_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int64x1_private a_[2]; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 1); + simde_int64x1x2_t r = { { + simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int64x2_private a_ = simde_int64x2_to_private(simde_vld1q_s64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -200,6 +249,16 @@ simde_vld2_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vget_high_u8(q) }; return u; + #elif defined(SIMDE_RISCV_V_NATIVE) && 
defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint8x8_private a_[2]; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + simde_uint8x8x2_t r = { { + simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint8x16_private a_ = simde_uint8x16_to_private(simde_vld1q_u8(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); @@ -233,6 +292,16 @@ simde_uint16x4x2_t simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint16x4_private a_[2]; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + simde_uint16x4x2_t r = { { + simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint16x8_private a_ = simde_uint16x8_to_private(simde_vld1q_u16(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7); @@ -240,6 +309,10 @@ simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_memcpy(&r, &a_, sizeof(r)); return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint16x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -247,6 +320,9 @@ simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_uint16x4x2_t r = { { simde_uint16x4_from_private(r_[0]), @@ -266,6 +342,16 @@ simde_uint32x2x2_t simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint32x2_private a_[2]; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 1); + simde_uint32x2x2_t r = { { + simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint32x4_private a_ = simde_uint32x4_to_private(simde_vld1q_u32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -296,9 +382,19 @@ simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x1x2_t -simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { +simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint64x1_private a_[2]; + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + simde_uint64x1x2_t 
r = { { + simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint64x2_private a_ = simde_uint64x2_to_private(simde_vld1q_u64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -327,11 +423,60 @@ simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #define vld2_u64(a) simde_vld2_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vld2_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2_f16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x4_private r_[2]; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 1); + simde_float16x4x2_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + } }; + return r; + #else + simde_float16x4_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_float16x4x2_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld2_f16 + #define vld2_f16(a) simde_vld2_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float32x2_private r_[2]; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 1); + simde_float32x2x2_t r = { { + simde_float32x2_from_private(r_[0]), + simde_float32x2_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_float32x4_private a_ = simde_float32x4_to_private(simde_vld1q_f32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -362,9 +507,19 @@ simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1x2_t -simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { +simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float64x1_private r_[2]; + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 1); + simde_float64x1x2_t r = { { + simde_float64x1_from_private(r_[0]), + simde_float64x1_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_float64x2_private a_ = simde_float64x2_to_private(simde_vld1q_f64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ 
-398,6 +553,16 @@ simde_int8x16x2_t simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int8x16_private a_[2]; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 1); + simde_int8x16x2_t r = { { + simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s8( @@ -405,6 +570,10 @@ simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { simde_vld1q_s8(&(ptr[16])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int8x16_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -419,6 +588,9 @@ simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -431,6 +603,16 @@ simde_int32x4x2_t simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int32x4_private a_[2]; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 1); + simde_int32x4x2_t r = { { + simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s32( @@ -438,6 +620,10 @@ simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_vld1q_s32(&(ptr[4])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int32x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -445,6 +631,9 @@ simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_int32x4x2_t r = { { simde_int32x4_from_private(r_[0]), @@ -464,6 +653,16 @@ simde_int16x8x2_t simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int16x8_private r_[2]; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 1); + simde_int16x8x2_t r = { { + simde_int16x8_from_private(r_[0]), + simde_int16x8_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s16( @@ -471,6 +670,10 @@ simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vld1q_s16(&(ptr[8])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + 
HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int16x8_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -485,6 +688,9 @@ simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -497,6 +703,16 @@ simde_int64x2x2_t simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int64x2_private r_[2]; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 1); + simde_int64x2x2_t r = { { + simde_int64x2_from_private(r_[0]), + simde_int64x2_from_private(r_[1]), + } }; + return r; #else simde_int64x2_private r_[2]; @@ -524,6 +740,16 @@ simde_uint8x16x2_t simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint8x16_private r_[2]; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + simde_uint8x16x2_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u8( @@ -531,6 +757,10 @@ simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { simde_vld1q_u8(&(ptr[16])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint8x16_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -545,6 +775,9 @@ simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -557,6 +790,16 @@ simde_uint16x8x2_t simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint16x8_private r_[2]; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + simde_uint16x8x2_t r = { { + simde_uint16x8_from_private(r_[0]), + simde_uint16x8_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u16( @@ -564,6 +807,10 @@ simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vld1q_u16(&(ptr[8])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint16x8_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -578,6 +825,9 @@ simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { } 
}; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -590,6 +840,16 @@ simde_uint32x4x2_t simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint32x4_private r_[2]; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 1); + simde_uint32x4x2_t r = { { + simde_uint32x4_from_private(r_[0]), + simde_uint32x4_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u32( @@ -597,6 +857,10 @@ simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_vld1q_u32(&(ptr[4])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint32x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -604,6 +868,9 @@ simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_uint32x4x2_t r = { { simde_uint32x4_from_private(r_[0]), @@ -623,6 +890,16 @@ simde_uint64x2x2_t simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint64x2_private r_[2]; + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + simde_uint64x2x2_t r = { { + simde_uint64x2_from_private(r_[0]), + simde_uint64x2_from_private(r_[1]), + } }; + return r; #else simde_uint64x2_private r_[2]; @@ -645,11 +922,67 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #define vld2q_u64(a) simde_vld2q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vld2q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2q_f16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x8_private r_[2]; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 1); + simde_float16x8x2_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + } }; + return r; + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_float16x8_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + 
HEDLEY_DIAGNOSTIC_POP + #endif + + simde_float16x8x2_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld2q_f16 + #define vld2q_f16(a) simde_vld2q_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float32x4_private r_[2]; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 1); + simde_float32x4x2_t r = { { + simde_float32x4_from_private(r_[0]), + simde_float32x4_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_f32( @@ -657,6 +990,10 @@ simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_vld1q_f32(&(ptr[4])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_float32x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) { @@ -664,6 +1001,9 @@ simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_float32x4x2_t r = { { simde_float32x4_from_private(r_[0]), @@ -683,6 +1023,16 @@ simde_float64x2x2_t simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float64x2_private r_[2]; + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 1); + simde_float64x2x2_t r = { { + simde_float64x2_from_private(r_[0]), + simde_float64x2_from_private(r_[1]), + } }; + return r; #else simde_float64x2_private r_[2]; @@ -705,6 +1055,278 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #define vld2q_f64(a) simde_vld2q_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vld2_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_p8(ptr); + #else + simde_poly8x8_private r_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + simde_poly8x8x2_t r = { { + simde_poly8x8_from_private(r_[0]), + simde_poly8x8_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_p8 + #define vld2_p8(a) simde_vld2_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t 
+simde_vld2_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_p16(ptr); + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_poly16x4_private r_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif + + simde_poly16x4x2_t r = { { + simde_poly16x4_from_private(r_[0]), + simde_poly16x4_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_p16 + #define vld2_p16(a) simde_vld2_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t +simde_vld2_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld2_p64(ptr); + #else + simde_poly64x1_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x1x2_t r = { { + simde_poly64x1_from_private(r_[0]), + simde_poly64x1_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_p64 + #define vld2_p64(a) simde_vld2_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vld2q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2q_p8(ptr); + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_poly8x16_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly8x16x2_t r = { { + simde_poly8x16_from_private(r_[0]), + simde_poly8x16_from_private(r_[1]), + } }; + + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_p8 + #define vld2q_p8(a) simde_vld2q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_poly16x8x2_t +simde_vld2q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2q_p16(ptr); + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_poly16x8_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly16x8x2_t r = { { + simde_poly16x8_from_private(r_[0]), + simde_poly16x8_from_private(r_[1]), + } }; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_p16 + #define vld2q_p16(a) simde_vld2q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t +simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_p64(ptr); + #else + simde_poly64x2_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x2x2_t r = { { + simde_poly64x2_from_private(r_[0]), + simde_poly64x2_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_p64 + #define vld2q_p64(a) simde_vld2q_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t +simde_vld2_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2_bf16(ptr); + #else + simde_bfloat16x4_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x4x2_t r = { { + simde_bfloat16x4_from_private(r_[0]), + simde_bfloat16x4_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld2_bf16 + #define vld2_bf16(a) simde_vld2_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t +simde_vld2q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2q_bf16(ptr); + #else + simde_bfloat16x8_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + 
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x8x2_t r = { { + simde_bfloat16x8_from_private(r_[0]), + simde_bfloat16x8_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld2q_bf16 + #define vld2q_bf16(a) simde_vld2q_bf16((a)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/ld2_dup.h b/arm/neon/ld2_dup.h new file mode 100644 index 000000000..a06a13edc --- /dev/null +++ b/arm/neon/ld2_dup.h @@ -0,0 +1,620 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD2_DUP_H) +#define SIMDE_ARM_NEON_LD2_DUP_H + +#include "dup_n.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vld2_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2_dup_f16(ptr); + #else + simde_float16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld2_dup_f16 + #define vld2_dup_f16(a) simde_vld2_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x2_t +simde_vld2_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_f32(ptr); + #else + simde_float32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_f32 + #define vld2_dup_f32(a) simde_vld2_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x2_t +simde_vld2_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2_dup_f64(ptr); + #else + simde_float64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_f64 + #define vld2_dup_f64(a) simde_vld2_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x2_t +simde_vld2_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s8(ptr); + #else + simde_int8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s8 + #define vld2_dup_s8(a) simde_vld2_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x2_t +simde_vld2_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s16(ptr); + #else + simde_int16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s16 + #define vld2_dup_s16(a) simde_vld2_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x2_t +simde_vld2_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s32(ptr); + #else + simde_int32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s32 + #define vld2_dup_s32(a) simde_vld2_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x2_t +simde_vld2_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s64(ptr); + #else + simde_int64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s64 + #define vld2_dup_s64(a) simde_vld2_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x2_t 
+simde_vld2_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u8(ptr); + #else + simde_uint8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u8 + #define vld2_dup_u8(a) simde_vld2_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x2_t +simde_vld2_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u16(ptr); + #else + simde_uint16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u16 + #define vld2_dup_u16(a) simde_vld2_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x2_t +simde_vld2_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u32(ptr); + #else + simde_uint32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u32 + #define vld2_dup_u32(a) simde_vld2_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x2_t +simde_vld2_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u64(ptr); + #else + simde_uint64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u64 + #define vld2_dup_u64(a) simde_vld2_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vld2q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2q_dup_f16(ptr); + #else + simde_float16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld2q_dup_f16 + #define vld2q_dup_f16(a) simde_vld2q_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x2_t +simde_vld2q_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_f32(ptr); + #else + simde_float32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_f32 + #define vld2q_dup_f32(a) simde_vld2q_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x2_t +simde_vld2q_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_f64(ptr); + #else + simde_float64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_f64 + #define vld2q_dup_f64(a) simde_vld2q_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x2_t +simde_vld2q_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s8(ptr); + #else + simde_int8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s8 + 
#define vld2q_dup_s8(a) simde_vld2q_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x2_t +simde_vld2q_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s16(ptr); + #else + simde_int16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s16 + #define vld2q_dup_s16(a) simde_vld2q_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x2_t +simde_vld2q_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s32(ptr); + #else + simde_int32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s32 + #define vld2q_dup_s32(a) simde_vld2q_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x2_t +simde_vld2q_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s64(ptr); + #else + simde_int64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s64 + #define vld2q_dup_s64(a) simde_vld2q_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x2_t +simde_vld2q_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u8(ptr); + #else + simde_uint8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u8 + #define vld2q_dup_u8(a) simde_vld2q_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x2_t +simde_vld2q_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u16(ptr); + #else + simde_uint16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u16 + #define vld2q_dup_u16(a) simde_vld2q_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x2_t +simde_vld2q_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u32(ptr); + #else + simde_uint32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u32 + #define vld2q_dup_u32(a) simde_vld2q_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x2_t +simde_vld2q_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u64(ptr); + #else + simde_uint64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u64 + #define vld2q_dup_u64(a) simde_vld2q_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vld2_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_p8(ptr); + #else + simde_poly8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_p8 + #define 
vld2_dup_p8(a) simde_vld2_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vld2_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_p16(ptr); + #else + simde_poly16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_p16 + #define vld2_dup_p16(a) simde_vld2_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t +simde_vld2_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld2_dup_p64(ptr); + #else + simde_poly64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_p64 + #define vld2_dup_p64(a) simde_vld2_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vld2q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763) + return vld2q_dup_p8(ptr); + #else + simde_poly8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763))) + #undef vld2q_dup_p8 + #define vld2q_dup_p8(a) simde_vld2q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vld2q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763) + return vld2q_dup_p16(ptr); + #else + simde_poly16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763))) + #undef vld2q_dup_p16 + #define vld2q_dup_p16(a) simde_vld2q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t +simde_vld2q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_p64(ptr); + #else + simde_poly64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_p64 + #define vld2q_dup_p64(a) simde_vld2q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t +simde_vld2_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2_dup_bf16(ptr); + #else + simde_bfloat16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld2_dup_bf16 + #define vld2_dup_bf16(a) simde_vld2_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t +simde_vld2q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2q_dup_bf16(ptr); + #else + simde_bfloat16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = 
simde_vdupq_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld2q_dup_bf16 + #define vld2q_dup_bf16(a) simde_vld2q_dup_bf16((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD2_DUP_H) */ diff --git a/arm/neon/ld2_lane.h b/arm/neon/ld2_lane.h new file mode 100644 index 000000000..b66cd456c --- /dev/null +++ b/arm/neon/ld2_lane.h @@ -0,0 +1,642 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD2_LANE_H) +#define SIMDE_ARM_NEON_LD2_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x2_t simde_vld2_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int8x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int8x8_private tmp_ = simde_int8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_s8(ptr, src, lane) vld2_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s8 + #define vld2_lane_s8(ptr, src, lane) simde_vld2_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x2_t simde_vld2_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int16x4_private tmp_ = simde_int16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_s16(ptr, src, lane) vld2_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s16 + #define vld2_lane_s16(ptr, src, lane) simde_vld2_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x2_t simde_vld2_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + 
simde_int32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int32x2_private tmp_ = simde_int32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_s32(ptr, src, lane) vld2_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s32 + #define vld2_lane_s32(ptr, src, lane) simde_vld2_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x2_t simde_vld2_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_int64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int64x1_private tmp_ = simde_int64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_s64(ptr, src, lane) vld2_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s64 + #define vld2_lane_s64(ptr, src, lane) simde_vld2_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x2_t simde_vld2_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint8x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint8x8_private tmp_ = simde_uint8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_u8(ptr, src, lane) vld2_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u8 + #define vld2_lane_u8(ptr, src, lane) simde_vld2_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x2_t simde_vld2_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint16x4_private tmp_ = simde_uint16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_u16(ptr, src, lane) vld2_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u16 + #define vld2_lane_u16(ptr, src, lane) simde_vld2_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x2_t simde_vld2_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint32x2_private tmp_ = simde_uint32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_u32(ptr, src, lane) vld2_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u32 + #define vld2_lane_u32(ptr, src, lane) simde_vld2_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x2_t simde_vld2_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t src, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_uint64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint64x1_private tmp_ = simde_uint64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_u64(ptr, src, lane) vld2_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u64 + #define vld2_lane_u64(ptr, src, lane) simde_vld2_lane_u64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t simde_vld2_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld2_lane_f16(ptr, src, lane) vld2_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld2_lane_f16 + #define vld2_lane_f16(ptr, src, lane) simde_vld2_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x2_t simde_vld2_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float32x2_private tmp_ = simde_float32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_f32(ptr, src, lane) vld2_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_f32 + #define vld2_lane_f32(ptr, src, lane) simde_vld2_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x2_t simde_vld2_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float64x1_private tmp_ = simde_float64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_f64(ptr, src, lane) vld2_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_f64 + #define vld2_lane_f64(ptr, src, lane) simde_vld2_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x2_t simde_vld2q_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int8x16x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_int8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int8x16_private tmp_ = simde_int8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_s8(ptr, src, lane) vld2q_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s8 + #define vld2q_lane_s8(ptr, src, 
lane) simde_vld2q_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x2_t simde_vld2q_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int16x8_private tmp_ = simde_int16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s16 + #define vld2q_lane_s16(ptr, src, lane) simde_vld2q_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x2_t simde_vld2q_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int32x4_private tmp_ = simde_int32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s32 + #define vld2q_lane_s32(ptr, src, lane) simde_vld2q_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x2_t simde_vld2q_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int64x2_private tmp_ = simde_int64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_s64(ptr, src, lane) vld2q_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s64 + #define vld2q_lane_s64(ptr, src, lane) simde_vld2q_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x2_t simde_vld2q_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint8x16x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_uint8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint8x16_private tmp_ = simde_uint8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_u8(ptr, src, lane) vld2q_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u8 + #define vld2q_lane_u8(ptr, src, lane) simde_vld2q_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x2_t simde_vld2q_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint16x8_private tmp_ = simde_uint16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16(ptr, src, lane) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u16 + #define vld2q_lane_u16(ptr, src, lane) simde_vld2q_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x2_t simde_vld2q_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint32x4_private tmp_ = simde_uint32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u32 + #define vld2q_lane_u32(ptr, src, lane) simde_vld2q_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x2_t simde_vld2q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint64x2_private tmp_ = simde_uint64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_u64(ptr, src, lane) vld2q_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u64 + #define vld2q_lane_u64(ptr, src, lane) simde_vld2q_lane_u64((ptr), (src), (lane)) +#endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t simde_vld2q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld2q_lane_f16(ptr, src, lane) vld2q_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld2q_lane_f16 + #define vld2q_lane_f16(ptr, src, lane) simde_vld2q_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x2_t simde_vld2q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float32x4_private tmp_ = simde_float32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_f32 + #define vld2q_lane_f32(ptr, src, lane) simde_vld2q_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x2_t simde_vld2q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float64x2_private tmp_ = 
simde_float64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_f64(ptr, src, lane) vld2q_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_f64 + #define vld2q_lane_f64(ptr, src, lane) simde_vld2q_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t simde_vld2_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_p8(ptr, src, lane) vld2_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_p8 + #define vld2_lane_p8(ptr, src, lane) simde_vld2_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t simde_vld2_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_p16(ptr, src, lane) vld2_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_p16 + #define vld2_lane_p16(ptr, src, lane) simde_vld2_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t simde_vld2_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_p64(ptr, src, lane) vld2_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_p64 + #define vld2_lane_p64(ptr, src, lane) simde_vld2_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t simde_vld2q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x16x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_p8(ptr, src, lane) vld2q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_p8 + #define vld2q_lane_p8(ptr, src, lane) simde_vld2q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t simde_vld2q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + 
simde_poly16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_p16(ptr, src, lane) vld2q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_p16 + #define vld2q_lane_p16(ptr, src, lane) simde_vld2q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t simde_vld2q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_p64(ptr, src, lane) vld2q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_p64 + #define vld2q_lane_p64(ptr, src, lane) simde_vld2q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t simde_vld2_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld2_lane_bf16(ptr, src, lane) vld2_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld2_lane_bf16 + #define vld2_lane_bf16(ptr, src, lane) simde_vld2_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t simde_vld2q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld2q_lane_bf16(ptr, src, lane) vld2q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld2q_lane_bf16 + #define vld2q_lane_bf16(ptr, src, lane) simde_vld2q_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD2_LANE_H) */ diff --git a/arm/neon/ld3.h b/arm/neon/ld3.h index e13eff1db..eddc86a51 100644 --- a/arm/neon/ld3.h +++ b/arm/neon/ld3.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD3_H) @@ -40,6 +42,41 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES 
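/* Note (illustrative; the buffer and variable names below are hypothetical):
 * simde_vld3_f16 performs a 3-way de-interleaving load of 16-bit floats.
 * On RISC-V with the Zvlsseg and Zvfh extensions it maps to a single
 * segmented load (__riscv_vlseg3e16); otherwise the scalar fallback gathers
 * every third element of `ptr` into each of the three result vectors.
 *
 *   simde_float16_t buf[12];                      // laid out a0 b0 c0 a1 b1 c1 ...
 *   simde_float16x4x3_t v = simde_vld3_f16(buf);  // v.val[0] = {a0..a3}, v.val[1] = {b0..b3}, ...
 */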
+simde_float16x4x3_t +simde_vld3_f16(simde_float16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3_f16(ptr); + #else + simde_float16x4_private r_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) && \ + SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + simde_float16x4x3_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + simde_float16x4_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld3_f16 + #define vld3_f16(a) simde_vld3_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x3_t simde_vld3_f32(simde_float32 const *ptr) { @@ -47,13 +84,18 @@ simde_vld3_f32(simde_float32 const *ptr) { return vld3_f32(ptr); #else simde_float32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float32x2x3_t r = { { simde_float32x2_from_private(r_[0]), simde_float32x2_from_private(r_[1]), @@ -75,13 +117,18 @@ simde_vld3_f64(simde_float64 const *ptr) { return vld3_f64(ptr); #else simde_float64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float64x1x3_t r = { { simde_float64x1_from_private(r_[0]), simde_float64x1_from_private(r_[1]), @@ -103,13 +150,18 @@ simde_vld3_s8(int8_t const *ptr) { return vld3_s8(ptr); #else simde_int8x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * 
(sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int8x8x3_t r = { { simde_int8x8_from_private(r_[0]), simde_int8x8_from_private(r_[1]), @@ -131,13 +183,18 @@ simde_vld3_s16(int16_t const *ptr) { return vld3_s16(ptr); #else simde_int16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int16x4x3_t r = { { simde_int16x4_from_private(r_[0]), simde_int16x4_from_private(r_[1]), @@ -159,13 +216,18 @@ simde_vld3_s32(int32_t const *ptr) { return vld3_s32(ptr); #else simde_int32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int32x2x3_t r = { { simde_int32x2_from_private(r_[0]), simde_int32x2_from_private(r_[1]), @@ -187,13 +249,18 @@ simde_vld3_s64(int64_t const *ptr) { return vld3_s64(ptr); #else simde_int64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int64x1x3_t r = { { simde_int64x1_from_private(r_[0]), simde_int64x1_from_private(r_[1]), @@ -203,7 
+270,7 @@ simde_vld3_s64(int64_t const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld3_s64 #define vld3_s64(a) simde_vld3_s64((a)) #endif @@ -215,13 +282,18 @@ simde_vld3_u8(uint8_t const *ptr) { return vld3_u8(ptr); #else simde_uint8x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint8x8x3_t r = { { simde_uint8x8_from_private(r_[0]), simde_uint8x8_from_private(r_[1]), @@ -243,13 +315,18 @@ simde_vld3_u16(uint16_t const *ptr) { return vld3_u16(ptr); #else simde_uint16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint16x4x3_t r = { { simde_uint16x4_from_private(r_[0]), simde_uint16x4_from_private(r_[1]), @@ -271,13 +348,18 @@ simde_vld3_u32(uint32_t const *ptr) { return vld3_u32(ptr); #else simde_uint32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint32x2x3_t r = { { simde_uint32x2_from_private(r_[0]), simde_uint32x2_from_private(r_[1]), @@ -299,13 +381,18 @@ simde_vld3_u64(uint64_t const *ptr) { return vld3_u64(ptr); #else simde_uint64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t 
dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint64x1x3_t r = { { simde_uint64x1_from_private(r_[0]), simde_uint64x1_from_private(r_[1]), @@ -315,16 +402,63 @@ simde_vld3_u64(uint64_t const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld3_u64 #define vld3_u64(a) simde_vld3_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t +simde_vld3q_f16(simde_float16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3q_f16(ptr); + #else + simde_float16x8_private r_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + simde_float16x8x3_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + simde_float16x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld3q_f16 + #define vld3q_f16(a) simde_vld3q_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x3_t simde_vld3q_f32(simde_float32 const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float32x4_private r_[3]; + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 2); + simde_float32x4x3_t r = { { + simde_float32x4_from_private(r_[0]), + simde_float32x4_from_private(r_[1]), + simde_float32x4_from_private(r_[2]) + } }; + return r; #else simde_float32x4_private r_[3]; @@ -353,6 +487,18 @@ simde_float64x2x3_t simde_vld3q_f64(simde_float64 const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float64x2_private r_[3]; + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 2); + simde_float64x2x3_t r = { { + simde_float64x2_from_private(r_[0]), + simde_float64x2_from_private(r_[1]), + simde_float64x2_from_private(r_[2]) + } }; + return r; #else simde_float64x2_private r_[3]; @@ -381,6 +527,18 @@ simde_int8x16x3_t simde_vld3q_s8(int8_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return 
vld3q_s8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int8x16_private r_[3]; + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 2); + simde_int8x16x3_t r = { { + simde_int8x16_from_private(r_[0]), + simde_int8x16_from_private(r_[1]), + simde_int8x16_from_private(r_[2]) + } }; + return r; #else simde_int8x16_private r_[3]; @@ -409,6 +567,18 @@ simde_int16x8x3_t simde_vld3q_s16(int16_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int16x8_private r_[3]; + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 2); + simde_int16x8x3_t r = { { + simde_int16x8_from_private(r_[0]), + simde_int16x8_from_private(r_[1]), + simde_int16x8_from_private(r_[2]) + } }; + return r; #else simde_int16x8_private r_[3]; @@ -437,6 +607,18 @@ simde_int32x4x3_t simde_vld3q_s32(int32_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int32x4_private r_[3]; + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 2); + simde_int32x4x3_t r = { { + simde_int32x4_from_private(r_[0]), + simde_int32x4_from_private(r_[1]), + simde_int32x4_from_private(r_[2]) + } }; + return r; #else simde_int32x4_private r_[3]; @@ -465,6 +647,18 @@ simde_int64x2x3_t simde_vld3q_s64(int64_t const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int64x2_private r_[3]; + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 2); + simde_int64x2x3_t r = { { + simde_int64x2_from_private(r_[0]), + simde_int64x2_from_private(r_[1]), + simde_int64x2_from_private(r_[2]) + } }; + return r; #else simde_int64x2_private r_[3]; @@ -494,6 +688,18 @@ simde_uint8x16x3_t simde_vld3q_u8(uint8_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint8x16_private r_[3]; + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + simde_uint8x16x3_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + simde_uint8x16_from_private(r_[2]) + } }; + return r; #else simde_uint8x16_private r_[3]; @@ -522,6 +728,18 @@ simde_uint16x8x3_t simde_vld3q_u16(uint16_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint16x8_private r_[3]; + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8); + r_[0].sv128 = 
__riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + simde_uint16x8x3_t r = { { + simde_uint16x8_from_private(r_[0]), + simde_uint16x8_from_private(r_[1]), + simde_uint16x8_from_private(r_[2]) + } }; + return r; #else simde_uint16x8_private r_[3]; @@ -550,6 +768,18 @@ simde_uint32x4x3_t simde_vld3q_u32(uint32_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint32x4_private r_[3]; + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 2); + simde_uint32x4x3_t r = { { + simde_uint32x4_from_private(r_[0]), + simde_uint32x4_from_private(r_[1]), + simde_uint32x4_from_private(r_[2]) + } }; + return r; #else simde_uint32x4_private r_[3]; @@ -578,6 +808,18 @@ simde_uint64x2x3_t simde_vld3q_u64(uint64_t const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint64x2_private r_[3]; + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + simde_uint64x2x3_t r = { { + simde_uint64x2_from_private(r_[0]), + simde_uint64x2_from_private(r_[1]), + simde_uint64x2_from_private(r_[2]) + } }; + return r; #else simde_uint64x2_private r_[3]; @@ -601,6 +843,274 @@ simde_vld3q_u64(uint64_t const *ptr) { #define vld3q_u64(a) simde_vld3q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t +simde_vld3_p8(simde_poly8_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_p8(ptr); + #else + simde_poly8x8_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly8x8x3_t r = { { + simde_poly8x8_from_private(r_[0]), + simde_poly8x8_from_private(r_[1]), + simde_poly8x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_p8 + #define vld3_p8(a) simde_vld3_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t +simde_vld3_p16(simde_poly16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_p16(ptr); + #else + simde_poly16x4_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } 
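      /* Portable fallback: with three output vectors, element j of r_[i] is
       * taken from ptr[i + j*3], so memory laid out as p0 q0 r0 p1 q1 r1 ...
       * is de-interleaved into {p...}, {q...} and {r...}. */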
+ #endif + + simde_poly16x4x3_t r = { { + simde_poly16x4_from_private(r_[0]), + simde_poly16x4_from_private(r_[1]), + simde_poly16x4_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_p16 + #define vld3_p16(a) simde_vld3_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t +simde_vld3_p64(simde_poly64_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld3_p64(ptr); + #else + simde_poly64x1_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x1x3_t r = { { + simde_poly64x1_from_private(r_[0]), + simde_poly64x1_from_private(r_[1]), + simde_poly64x1_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_p64 + #define vld3_p64(a) simde_vld3_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t +simde_vld3q_p8(simde_poly8_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3q_p8(ptr); + #else + simde_poly8x16_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly8x16x3_t r = { { + simde_poly8x16_from_private(r_[0]), + simde_poly8x16_from_private(r_[1]), + simde_poly8x16_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_p8 + #define vld3q_p8(a) simde_vld3q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t +simde_vld3q_p16(simde_poly16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3q_p16(ptr); + #else + simde_poly16x8_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly16x8x3_t r = { { + simde_poly16x8_from_private(r_[0]), + simde_poly16x8_from_private(r_[1]), + simde_poly16x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_p16 + #define vld3q_p16(a) simde_vld3q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t +simde_vld3q_p64(simde_poly64_t const *ptr) { + #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_p64(ptr); + #else + simde_poly64x2_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x2x3_t r = { { + simde_poly64x2_from_private(r_[0]), + simde_poly64x2_from_private(r_[1]), + simde_poly64x2_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_p64 + #define vld3q_p64(a) simde_vld3q_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t +simde_vld3_bf16(simde_bfloat16 const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3_bf16(ptr); + #else + simde_bfloat16x4_private r_[3]; + + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x4x3_t r = { { + simde_bfloat16x4_from_private(r_[0]), + simde_bfloat16x4_from_private(r_[1]), + simde_bfloat16x4_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld3_bf16 + #define vld3_bf16(a) simde_vld3_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t +simde_vld3q_bf16(simde_bfloat16 const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3q_bf16(ptr); + #else + simde_bfloat16x8_private r_[3]; + + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x8x3_t r = { { + simde_bfloat16x8_from_private(r_[0]), + simde_bfloat16x8_from_private(r_[1]), + simde_bfloat16x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld3q_bf16 + #define vld3q_bf16(a) simde_vld3q_bf16((a)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/ld3_dup.h b/arm/neon/ld3_dup.h new file mode 100644 index 000000000..86d604481 --- /dev/null +++ b/arm/neon/ld3_dup.h @@ -0,0 +1,616 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD3_DUP_H) +#define SIMDE_ARM_NEON_LD3_DUP_H + +#include "dup_n.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t +simde_vld3_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3_dup_f16(ptr); + #else + simde_float16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld3_dup_f16 + #define vld3_dup_f16(a) simde_vld3_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x3_t +simde_vld3_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_f32(ptr); + #else + simde_float32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_f32 + #define vld3_dup_f32(a) simde_vld3_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x3_t +simde_vld3_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3_dup_f64(ptr); + #else + simde_float64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_f64 + #define vld3_dup_f64(a) simde_vld3_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x3_t +simde_vld3_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s8(ptr); + #else + simde_int8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s8 + #define vld3_dup_s8(a) simde_vld3_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x3_t +simde_vld3_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s16(ptr); + #else + simde_int16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s16 + #define vld3_dup_s16(a) simde_vld3_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x3_t +simde_vld3_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s32(ptr); + #else + simde_int32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s32 + #define vld3_dup_s32(a) simde_vld3_dup_s32((a)) 
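/* The vld3_dup family reads three consecutive scalars (ptr[0], ptr[1], ptr[2])
 * and broadcasts each one across its own output vector via the vdup_n helpers.
 * Minimal usage sketch; the buffer name is hypothetical:
 *
 *   int32_t buf[3] = { 1, 2, 3 };
 *   simde_int32x2x3_t d = simde_vld3_dup_s32(buf);
 *   // d.val[0] == {1, 1}, d.val[1] == {2, 2}, d.val[2] == {3, 3}
 */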
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x3_t +simde_vld3_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s64(ptr); + #else + simde_int64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s64 + #define vld3_dup_s64(a) simde_vld3_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x3_t +simde_vld3_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u8(ptr); + #else + simde_uint8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u8 + #define vld3_dup_u8(a) simde_vld3_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x3_t +simde_vld3_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u16(ptr); + #else + simde_uint16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u16 + #define vld3_dup_u16(a) simde_vld3_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x3_t +simde_vld3_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u32(ptr); + #else + simde_uint32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u32 + #define vld3_dup_u32(a) simde_vld3_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x3_t +simde_vld3_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u64(ptr); + #else + simde_uint64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u64 + #define vld3_dup_u64(a) simde_vld3_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t +simde_vld3q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3q_dup_f16(ptr); + #else + simde_float16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld3q_dup_f16 + #define vld3q_dup_f16(a) simde_vld3q_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x3_t +simde_vld3q_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_f32(ptr); + #else + simde_float32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_f32 + #define vld3q_dup_f32(a) simde_vld3q_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x3_t +simde_vld3q_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_f64(ptr); + #else + simde_float64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_f64(ptr[i]); + } + return r; + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_f64 + #define vld3q_dup_f64(a) simde_vld3q_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x3_t +simde_vld3q_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s8(ptr); + #else + simde_int8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s8 + #define vld3q_dup_s8(a) simde_vld3q_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x3_t +simde_vld3q_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s16(ptr); + #else + simde_int16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s16 + #define vld3q_dup_s16(a) simde_vld3q_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x3_t +simde_vld3q_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s32(ptr); + #else + simde_int32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s32 + #define vld3q_dup_s32(a) simde_vld3q_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x3_t +simde_vld3q_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s64(ptr); + #else + simde_int64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s64 + #define vld3q_dup_s64(a) simde_vld3q_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x3_t +simde_vld3q_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u8(ptr); + #else + simde_uint8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u8 + #define vld3q_dup_u8(a) simde_vld3q_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x3_t +simde_vld3q_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u16(ptr); + #else + simde_uint16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u16 + #define vld3q_dup_u16(a) simde_vld3q_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x3_t +simde_vld3q_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u32(ptr); + #else + simde_uint32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u32 + #define vld3q_dup_u32(a) simde_vld3q_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x3_t +simde_vld3q_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u64(ptr); + #else + simde_uint64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u64(ptr[i]); + } + return r; + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u64 + #define vld3q_dup_u64(a) simde_vld3q_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t +simde_vld3_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3_dup_p8(ptr); + #else + simde_poly8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_p8 + #define vld3_dup_p8(a) simde_vld3_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t +simde_vld3_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3_dup_p16(ptr); + #else + simde_poly16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_p16 + #define vld3_dup_p16(a) simde_vld3_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t +simde_vld3_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld3_dup_p64(ptr); + #else + simde_poly64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_p64 + #define vld3_dup_p64(a) simde_vld3_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t +simde_vld3q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3q_dup_p8(ptr); + #else + simde_poly8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld3q_dup_p8 + #define vld3q_dup_p8(a) simde_vld3q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t +simde_vld3q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3q_dup_p16(ptr); + #else + simde_poly16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld3q_dup_p16 + #define vld3q_dup_p16(a) simde_vld3q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t +simde_vld3q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_p64(ptr); + #else + simde_poly64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_p64 + #define vld3q_dup_p64(a) simde_vld3q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t +simde_vld3_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3_dup_bf16(ptr); + #else + simde_bfloat16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld3_dup_bf16 + #define vld3_dup_bf16(a) simde_vld3_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t +simde_vld3q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3q_dup_bf16(ptr); + #else + simde_bfloat16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld3q_dup_bf16 + #define vld3q_dup_bf16(a) simde_vld3q_dup_bf16((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD3_DUP_H) */ diff --git a/arm/neon/ld3_lane.h b/arm/neon/ld3_lane.h new file mode 100644 index 000000000..5072dd4ae --- /dev/null +++ b/arm/neon/ld3_lane.h @@ -0,0 +1,642 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD3_LANE_H) +#define SIMDE_ARM_NEON_LD3_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x3_t simde_vld3_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int8x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int8x8_private tmp_ = simde_int8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_s8(ptr, src, lane) vld3_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s8 + #define vld3_lane_s8(ptr, src, lane) simde_vld3_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x3_t simde_vld3_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int16x4_private tmp_ = simde_int16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_s16(ptr, src, lane) vld3_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s16 + #define vld3_lane_s16(ptr, src, lane) simde_vld3_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x3_t simde_vld3_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int32x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int32x2_private tmp_ = simde_int32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_s32(ptr, src, lane) vld3_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s32 + #define vld3_lane_s32(ptr, src, lane) simde_vld3_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x3_t simde_vld3_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_int64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int64x1_private tmp_ = simde_int64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_s64(ptr, src, lane) vld3_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s64 + #define vld3_lane_s64(ptr, src, lane) simde_vld3_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x3_t simde_vld3_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint8x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint8x8_private tmp_ = simde_uint8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x8_from_private(tmp_); + } + return r; +} +#if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_u8(ptr, src, lane) vld3_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u8 + #define vld3_lane_u8(ptr, src, lane) simde_vld3_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x3_t simde_vld3_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint16x4_private tmp_ = simde_uint16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_u16(ptr, src, lane) vld3_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u16 + #define vld3_lane_u16(ptr, src, lane) simde_vld3_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x3_t simde_vld3_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint32x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint32x2_private tmp_ = simde_uint32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_u32(ptr, src, lane) vld3_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u32 + #define vld3_lane_u32(ptr, src, lane) simde_vld3_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x3_t simde_vld3_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_uint64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint64x1_private tmp_ = simde_uint64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_u64(ptr, src, lane) vld3_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u64 + #define vld3_lane_u64(ptr, src, lane) simde_vld3_lane_u64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t simde_vld3_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld3_lane_f16(ptr, src, lane) vld3_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld3_lane_f16 + #define vld3_lane_f16(ptr, src, lane) simde_vld3_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x3_t simde_vld3_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2x3_t r; + + for (size_t 
i = 0 ; i < 3 ; i++) { + simde_float32x2_private tmp_ = simde_float32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_f32(ptr, src, lane) vld3_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_f32 + #define vld3_lane_f32(ptr, src, lane) simde_vld3_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x3_t simde_vld3_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float64x1_private tmp_ = simde_float64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_f64(ptr, src, lane) vld3_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_f64 + #define vld3_lane_f64(ptr, src, lane) simde_vld3_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x3_t simde_vld3q_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int8x16x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_int8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int8x16_private tmp_ = simde_int8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_s8(ptr, src, lane) vld3q_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s8 + #define vld3q_lane_s8(ptr, src, lane) simde_vld3q_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x3_t simde_vld3q_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int16x8_private tmp_ = simde_int16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s16 + #define vld3q_lane_s16(ptr, src, lane) simde_vld3q_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x3_t simde_vld3q_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int32x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int32x4_private tmp_ = simde_int32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s32 + #define vld3q_lane_s32(ptr, src, lane) simde_vld3q_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x3_t simde_vld3q_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x2x3_t src, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int64x2_private tmp_ = simde_int64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_s64(ptr, src, lane) vld3q_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s64 + #define vld3q_lane_s64(ptr, src, lane) simde_vld3q_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x3_t simde_vld3q_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint8x16x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_uint8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint8x16_private tmp_ = simde_uint8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_u8(ptr, src, lane) vld3q_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u8 + #define vld3q_lane_u8(ptr, src, lane) simde_vld3q_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x3_t simde_vld3q_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint16x8_private tmp_ = simde_uint16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u16 + #define vld3q_lane_u16(ptr, src, lane) simde_vld3q_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x3_t simde_vld3q_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint32x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint32x4_private tmp_ = simde_uint32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u32 + #define vld3q_lane_u32(ptr, src, lane) simde_vld3q_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x3_t simde_vld3q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint64x2_private tmp_ = simde_uint64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_u64(ptr, src, lane) vld3q_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u64 + #define vld3q_lane_u64(ptr, src, lane) simde_vld3q_lane_u64((ptr), (src), (lane)) +#endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t 
simde_vld3q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld3q_lane_f16 + #define vld3q_lane_f16(ptr, src, lane) simde_vld3q_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x3_t simde_vld3q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float32x4_private tmp_ = simde_float32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_f32 + #define vld3q_lane_f32(ptr, src, lane) simde_vld3q_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x3_t simde_vld3q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float64x2_private tmp_ = simde_float64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_f64(ptr, src, lane) vld3q_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_f64 + #define vld3q_lane_f64(ptr, src, lane) simde_vld3q_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t simde_vld3_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_p8(ptr, src, lane) vld3_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_p8 + #define vld3_lane_p8(ptr, src, lane) simde_vld3_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t simde_vld3_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_p16(ptr, src, lane) vld3_lane_p16(ptr, src, 
lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_p16 + #define vld3_lane_p16(ptr, src, lane) simde_vld3_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t simde_vld3_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_p64(ptr, src, lane) vld3_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_p64 + #define vld3_lane_p64(ptr, src, lane) simde_vld3_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t simde_vld3q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x16x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_p8(ptr, src, lane) vld3q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_p8 + #define vld3q_lane_p8(ptr, src, lane) simde_vld3q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t simde_vld3q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_p16(ptr, src, lane) vld3q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_p16 + #define vld3q_lane_p16(ptr, src, lane) simde_vld3q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t simde_vld3q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_p64(ptr, src, lane) vld3q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_p64 + #define vld3q_lane_p64(ptr, src, lane) simde_vld3q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t simde_vld3_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x4_from_private(tmp_); + } + 
return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld3_lane_bf16(ptr, src, lane) vld3_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld3_lane_bf16 + #define vld3_lane_bf16(ptr, src, lane) simde_vld3_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t simde_vld3q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld3q_lane_bf16(ptr, src, lane) vld3q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld3q_lane_bf16 + #define vld3q_lane_bf16(ptr, src, lane) simde_vld3q_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD3_LANE_H) */ diff --git a/arm/neon/ld4.h b/arm/neon/ld4.h index b93618248..e6a9b9487 100644 --- a/arm/neon/ld4.h +++ b/arm/neon/ld4.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD4_H) @@ -39,6 +41,36 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld4_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4_f16(ptr); + #else + simde_float16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]), + simde_float16x4_from_private(a_[2]), simde_float16x4_from_private(a_[3]) } }; + return (s_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld4_f16 + #define vld4_f16(a) simde_vld4_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x4_t simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { @@ -46,9 +78,17 @@ simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_f32(ptr); #else simde_float32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x4_t dest = 
__riscv_vlseg4e32_v_f32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]), simde_float32x2_from_private(a_[2]), simde_float32x2_from_private(a_[3]) } }; return (s_); @@ -66,9 +106,17 @@ simde_vld4_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_f64(ptr); #else simde_float64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]), simde_float64x1_from_private(a_[2]), simde_float64x1_from_private(a_[3]) } }; return s_; @@ -86,9 +134,17 @@ simde_vld4_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_s8(ptr); #else simde_int8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]), simde_int8x8_from_private(a_[2]), simde_int8x8_from_private(a_[3]) } }; return s_; @@ -106,9 +162,17 @@ simde_vld4_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_s16(ptr); #else simde_int16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]), simde_int16x4_from_private(a_[2]), simde_int16x4_from_private(a_[3]) } }; return s_; @@ -126,9 +190,17 @@ simde_vld4_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_s32(ptr); #else simde_int32x2_private a_[4]; - for (size_t i = 0; i < 
(sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]), simde_int32x2_from_private(a_[2]), simde_int32x2_from_private(a_[3]) } }; return s_; @@ -146,15 +218,23 @@ simde_vld4_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_s64(ptr); #else simde_int64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]), simde_int64x1_from_private(a_[2]), simde_int64x1_from_private(a_[3]) } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld4_s64 #define vld4_s64(a) simde_vld4_s64((a)) #endif @@ -166,9 +246,17 @@ simde_vld4_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_u8(ptr); #else simde_uint8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]), simde_uint8x8_from_private(a_[2]), simde_uint8x8_from_private(a_[3]) } }; return s_; @@ -186,9 +274,17 @@ simde_vld4_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_u16(ptr); #else simde_uint16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif 
simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]), simde_uint16x4_from_private(a_[2]), simde_uint16x4_from_private(a_[3]) } }; return s_; @@ -206,9 +302,17 @@ simde_vld4_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_u32(ptr); #else simde_uint32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]), simde_uint32x2_from_private(a_[2]), simde_uint32x2_from_private(a_[3]) } }; return s_; @@ -226,19 +330,57 @@ simde_vld4_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_u64(ptr); #else simde_uint64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]), simde_uint64x1_from_private(a_[2]), simde_uint64x1_from_private(a_[3]) } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld4_u64 #define vld4_u64(a) simde_vld4_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld4q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4q_f16(ptr); + #else + simde_float16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]), + simde_float16x8_from_private(a_[2]), simde_float16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld4q_f16 + #define vld4q_f16(a) simde_vld4q_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x4_t 
simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { @@ -246,9 +388,17 @@ simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_f32(ptr); #else simde_float32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), simde_float32x4_from_private(a_[1]), simde_float32x4_from_private(a_[2]), simde_float32x4_from_private(a_[3]) } }; return s_; @@ -266,9 +416,17 @@ simde_vld4q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_f64(ptr); #else simde_float64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]), simde_float64x2_from_private(a_[2]), simde_float64x2_from_private(a_[3]) } }; return s_; @@ -286,9 +444,17 @@ simde_vld4q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld4q_s8(ptr); #else simde_int8x16_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]), simde_int8x16_from_private(a_[2]), simde_int8x16_from_private(a_[3]) } }; return s_; @@ -306,9 +472,17 @@ simde_vld4q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_s16(ptr); #else simde_int16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 3); + #else + for (size_t i = 0; i < 
(sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]), simde_int16x8_from_private(a_[2]), simde_int16x8_from_private(a_[3]) } }; return s_; @@ -326,9 +500,17 @@ simde_vld4q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_s32(ptr); #else simde_int32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]), simde_int32x4_from_private(a_[2]), simde_int32x4_from_private(a_[3]) } }; return s_; @@ -346,9 +528,17 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_s64(ptr); #else simde_int64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]), simde_int64x2_from_private(a_[2]), simde_int64x2_from_private(a_[3]) } }; return s_; @@ -358,7 +548,6 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #undef vld4q_s64 #define vld4q_s64(a) simde_vld4q_s64((a)) #endif - SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16x4_t simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { @@ -403,6 +592,20 @@ simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { simde_uint8x16_from_private(r_[2]), simde_uint8x16_from_private(r_[3])}}; return s_; + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint8x16_private r_[4]; + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + r_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + simde_uint8x16x4_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + simde_uint8x16_from_private(r_[2]), + simde_uint8x16_from_private(r_[3]) + } }; + return r; #else simde_uint8x16_private a_[4]; for (size_t i = 0; i < (sizeof(simde_uint8x16_t) / sizeof(*ptr)) * 4 ; i++) { @@ -425,9 +628,17 @@ simde_vld4q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_u16(ptr); #else simde_uint16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if 
defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]), simde_uint16x8_from_private(a_[2]), simde_uint16x8_from_private(a_[3]) } }; return s_; @@ -445,9 +656,17 @@ simde_vld4q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_u32(ptr); #else simde_uint32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]), simde_uint32x4_from_private(a_[2]), simde_uint32x4_from_private(a_[3]) } }; return s_; @@ -465,9 +684,17 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_u64(ptr); #else simde_uint64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]), simde_uint64x2_from_private(a_[2]), simde_uint64x2_from_private(a_[3]) } }; return s_; @@ -478,6 +705,216 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #define vld4q_u64(a) simde_vld4q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld4_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_p8(ptr); + #else + simde_poly8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]), + simde_poly8x8_from_private(a_[2]), 
simde_poly8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_p8 + #define vld4_p8(a) simde_vld4_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld4_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_p16(ptr); + #else + simde_poly16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]), + simde_poly16x4_from_private(a_[2]), simde_poly16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_p16 + #define vld4_p16(a) simde_vld4_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld4_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld4_p64(ptr); + #else + simde_poly64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]), + simde_poly64x1_from_private(a_[2]), simde_poly64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4_p64 + #define vld4_p64(a) simde_vld4_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld4q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4q_p8(ptr); + #else + simde_poly8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly8x16_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]), + simde_poly8x16_from_private(a_[2]), simde_poly8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_p8 + #define vld4q_p8(a) simde_vld4q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld4q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4q_p16(ptr); + #else + simde_poly16x8_private a_[4]; + #if 
defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]), + simde_poly16x8_from_private(a_[2]), simde_poly16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_p16 + #define vld4q_p16(a) simde_vld4q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld4q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_p64(ptr); + #else + simde_poly64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]), + simde_poly64x2_from_private(a_[2]), simde_poly64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_p64 + #define vld4q_p64(a) simde_vld4q_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld4_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4_bf16(ptr); + #else + simde_bfloat16x4_private a_[4]; + for (size_t i = 0; i < (sizeof(simde_bfloat16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + simde_bfloat16x4x4_t s_ = { { simde_bfloat16x4_from_private(a_[0]), simde_bfloat16x4_from_private(a_[1]), + simde_bfloat16x4_from_private(a_[2]), simde_bfloat16x4_from_private(a_[3]) } }; + return (s_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld4_bf16 + #define vld4_bf16(a) simde_vld4_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld4q_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4q_bf16(ptr); + #else + simde_bfloat16x8_private a_[4]; + for (size_t i = 0; i < (sizeof(simde_bfloat16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + simde_bfloat16x8x4_t s_ = { { simde_bfloat16x8_from_private(a_[0]), simde_bfloat16x8_from_private(a_[1]), + simde_bfloat16x8_from_private(a_[2]), simde_bfloat16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld4q_bf16 + #define vld4q_bf16(a) simde_vld4q_bf16((a)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ 
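For orientation while reading this import, here is a small, self-contained usage sketch (illustrative only, not part of the patch) showing how the de-interleaving vld4-style loads added above are called through SIMDe's portable API. The umbrella include path, the RGBA interpretation of the buffer, and the printed output are assumptions made for the example; only simde_vld4q_u8, simde_uint8x16x4_t, and simde_vgetq_lane_u8 come from the library itself.

/* Minimal sketch: de-interleave 16 RGBA pixels with the portable vld4q_u8
 * emulation.  Assumes the usual "simde/arm/neon.h" umbrella header. */
#include <stdio.h>
#include <stdint.h>
#include "simde/arm/neon.h"

int main(void) {
  uint8_t rgba[64];                            /* 16 pixels x 4 channels */
  for (int i = 0 ; i < 64 ; i++) rgba[i] = (uint8_t) i;

  /* After the structured load, val[0]..val[3] hold the R, G, B and A planes. */
  simde_uint8x16x4_t px = simde_vld4q_u8(rgba);

  printf("first pixel: R=%u G=%u B=%u A=%u\n",
         (unsigned) simde_vgetq_lane_u8(px.val[0], 0),
         (unsigned) simde_vgetq_lane_u8(px.val[1], 0),
         (unsigned) simde_vgetq_lane_u8(px.val[2], 0),
         (unsigned) simde_vgetq_lane_u8(px.val[3], 0));
  return 0;
}

On targets without NEON this goes through the scalar loop or, where available, the RVV segment-load branch added in this file; on ARM hardware with NEON it maps to the native vld4q_u8.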
SIMDE_END_DECLS_ diff --git a/arm/neon/ld4_dup.h b/arm/neon/ld4_dup.h new file mode 100644 index 000000000..a974b4fd5 --- /dev/null +++ b/arm/neon/ld4_dup.h @@ -0,0 +1,617 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD4_DUP_H) +#define SIMDE_ARM_NEON_LD4_DUP_H + +#include "dup_n.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld4_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4_dup_f16(ptr); + #else + simde_float16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld4_dup_f16 + #define vld4_dup_f16(a) simde_vld4_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x4_t +simde_vld4_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_f32(ptr); + #else + simde_float32x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_f32 + #define vld4_dup_f32(a) simde_vld4_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x4_t +simde_vld4_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4_dup_f64(ptr); + #else + simde_float64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_f64 + #define vld4_dup_f64(a) simde_vld4_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x4_t +simde_vld4_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s8(ptr); + #else + simde_int8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s8 + #define vld4_dup_s8(a) simde_vld4_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x4_t 
+simde_vld4_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s16(ptr); + #else + simde_int16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s16 + #define vld4_dup_s16(a) simde_vld4_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x4_t +simde_vld4_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s32(ptr); + #else + simde_int32x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s32 + #define vld4_dup_s32(a) simde_vld4_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x4_t +simde_vld4_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s64(ptr); + #else + simde_int64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s64 + #define vld4_dup_s64(a) simde_vld4_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x4_t +simde_vld4_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u8(ptr); + #else + simde_uint8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u8 + #define vld4_dup_u8(a) simde_vld4_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x4_t +simde_vld4_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u16(ptr); + #else + simde_uint16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u16 + #define vld4_dup_u16(a) simde_vld4_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x4_t +simde_vld4_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u32(ptr); + #else + simde_uint32x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u32 + #define vld4_dup_u32(a) simde_vld4_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x4_t +simde_vld4_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u64(ptr); + #else + simde_uint64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u64 + #define vld4_dup_u64(a) simde_vld4_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld4q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4q_dup_f16(ptr); + #else + simde_float16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld4q_dup_f16 + #define vld4q_dup_f16(a) 
simde_vld4q_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x4_t +simde_vld4q_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_f32(ptr); + #else + simde_float32x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_f32 + #define vld4q_dup_f32(a) simde_vld4q_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x4_t +simde_vld4q_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_f64(ptr); + #else + simde_float64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_f64 + #define vld4q_dup_f64(a) simde_vld4q_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x4_t +simde_vld4q_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s8(ptr); + #else + simde_int8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s8 + #define vld4q_dup_s8(a) simde_vld4q_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x4_t +simde_vld4q_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s16(ptr); + #else + simde_int16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s16 + #define vld4q_dup_s16(a) simde_vld4q_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x4_t +simde_vld4q_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s32(ptr); + #else + simde_int32x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s32 + #define vld4q_dup_s32(a) simde_vld4q_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x4_t +simde_vld4q_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s64(ptr); + #else + simde_int64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s64 + #define vld4q_dup_s64(a) simde_vld4q_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x4_t +simde_vld4q_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u8(ptr); + #else + simde_uint8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u8 + #define vld4q_dup_u8(a) simde_vld4q_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x4_t +simde_vld4q_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u16(ptr); + #else + simde_uint16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u16 + #define vld4q_dup_u16(a) 
simde_vld4q_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x4_t +simde_vld4q_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u32(ptr); + #else + simde_uint32x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u32 + #define vld4q_dup_u32(a) simde_vld4q_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x4_t +simde_vld4q_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u64(ptr); + #else + simde_uint64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u64 + #define vld4q_dup_u64(a) simde_vld4q_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld4_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4_dup_p8(ptr); + #else + simde_poly8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_p8 + #define vld4_dup_p8(a) simde_vld4_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld4_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4_dup_p16(ptr); + #else + simde_poly16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld4_dup_p16 + #define vld4_dup_p16(a) simde_vld4_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld4_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld4_dup_p64(ptr); + #else + simde_poly64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_p64 + #define vld4_dup_p64(a) simde_vld4_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld4q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4q_dup_p8(ptr); + #else + simde_poly8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld4q_dup_p8 + #define vld4q_dup_p8(a) simde_vld4q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld4q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4q_dup_p16(ptr); + #else + simde_poly16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) + #undef vld4q_dup_p16 + #define vld4q_dup_p16(a) simde_vld4q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
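/*
 * Editorial illustration (not part of the upstream patch): every
 * simde_vld4_dup_* / simde_vld4q_dup_* fallback in this file follows the
 * same pattern -- read four consecutive scalars from `ptr` and broadcast
 * element i across vector val[i] of the x4 result.  A minimal usage sketch
 * (hypothetical variable names `buf` and `d`), relying only on the
 * functions defined above:
 *
 *   uint32_t buf[4] = { 1, 2, 3, 4 };
 *   simde_uint32x4x4_t d = simde_vld4q_dup_u32(buf);
 *   // d.val[0] == {1,1,1,1}, d.val[1] == {2,2,2,2},
 *   // d.val[2] == {3,3,3,3}, d.val[3] == {4,4,4,4}
 */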
+simde_poly64x2x4_t +simde_vld4q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_p64(ptr); + #else + simde_poly64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_p64 + #define vld4q_dup_p64(a) simde_vld4q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld4_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4_dup_bf16(ptr); + #else + simde_bfloat16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld4_dup_bf16 + #define vld4_dup_bf16(a) simde_vld4_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld4q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4q_dup_bf16(ptr); + #else + simde_bfloat16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld4q_dup_bf16 + #define vld4q_dup_bf16(a) simde_vld4q_dup_bf16((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD3_DUP_H) */ diff --git a/arm/neon/ld4_lane.h b/arm/neon/ld4_lane.h index c525755d2..cdcf079d4 100644 --- a/arm/neon/ld4_lane.h +++ b/arm/neon/ld4_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* In older versions of clang, __builtin_neon_vld4_lane_v would @@ -99,6 +100,7 @@ simde_vld4_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_int16x4x4_t #define vld4_lane_s16(ptr, src, lane) simde_vld4_lane_s16((ptr), (src), (lane)) #endif + SIMDE_FUNCTION_ATTRIBUTES simde_int32x2x4_t simde_vld4_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x2x4_t src, const int lane) @@ -261,6 +263,34 @@ simde_vld4_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_ #define vld4_lane_u64(ptr, src, lane) simde_vld4_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld4_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x4_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) + #define simde_vld4_lane_f16(ptr, src, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vld4_lane_f16(ptr, src, lane)) + #else + #define simde_vld4_lane_f16(ptr, src, lane) vld4_lane_f16(ptr, src, lane) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) 
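/*
 * Editorial illustration (not part of the upstream patch): when native
 * aliases are enabled, the #define below re-points the ACLE spelling at the
 * portable implementation above, so existing NEON code keeps compiling on
 * targets without FP16 support (hypothetical variable name `dst`; `ptr` and
 * `src` as in the macro parameters):
 *
 *   dst = vld4_lane_f16(ptr, src, 2);
 *   // expands to: dst = simde_vld4_lane_f16((ptr), (src), (2));
 */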
+ #undef vld4_lane_f16 + #define vld4_lane_f16(ptr, src, lane) simde_vld4_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x4_t simde_vld4_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x4_t src, const int lane) @@ -531,6 +561,34 @@ simde_vld4q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x4 #define vld4q_lane_u64(ptr, src, lane) simde_vld4q_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld4q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) + #define simde_vld4q_lane_f16(ptr, src, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vld4q_lane_f16(ptr, src, lane)) + #else + #define simde_vld4q_lane_f16(ptr, src, lane) vld4q_lane_f16(ptr, src, lane) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vld4q_lane_f16 + #define vld4q_lane_f16(ptr, src, lane) simde_vld4q_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x4_t simde_vld4q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4x4_t src, const int lane) @@ -585,6 +643,184 @@ simde_vld4q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_flo #define vld4q_lane_f64(ptr, src, lane) simde_vld4q_lane_f64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld4_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld4_lane_p8(ptr, src, lane) vld4_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_p8 + #define vld4_lane_p8(ptr, src, lane) simde_vld4_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld4_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x4_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld4_lane_p16(ptr, src, lane) vld4_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_p16 + #define vld4_lane_p16(ptr, src, lane) simde_vld4_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld4_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t src, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x1_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld4_lane_p64(ptr, src, lane) vld4_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_p64 + #define vld4_lane_p64(ptr, src, lane) simde_vld4_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld4q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x16x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x16_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld4q_lane_p8(ptr, src, lane) vld4q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_p8 + #define vld4q_lane_p8(ptr, src, lane) simde_vld4q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld4q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld4q_lane_p16(ptr, src, lane) vld4q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_p16 + #define vld4q_lane_p16(ptr, src, lane) simde_vld4q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld4q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x2_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld4q_lane_p64(ptr, src, lane) vld4q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_p64 + #define vld4q_lane_p64(ptr, src, lane) simde_vld4q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld4_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x4_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld4_lane_bf16(ptr, src, lane) vld4_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef 
vld4_lane_bf16 + #define vld4_lane_bf16(ptr, src, lane) simde_vld4_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld4q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld4q_lane_bf16(ptr, src, lane) vld4q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vld4q_lane_bf16 + #define vld4q_lane_bf16(ptr, src, lane) simde_vld4q_lane_bf16((ptr), (src), (lane)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/max.h b/arm/neon/max.h index 1e2b449e3..c18bc2138 100644 --- a/arm/neon/max.h +++ b/arm/neon/max.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su */ #if !defined(SIMDE_ARM_NEON_MAX_H) @@ -36,6 +38,54 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxh_f16(a, b); + #else + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + #if !defined(SIMDE_FAST_NANS) + r_ = (a_ >= b_) ? a_ : ((a_ < b_) ? b_ : SIMDE_MATH_NANF); + #else + r_ = (a_ > b_) ? a_ : b_; + #endif + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxh_f16 + #define vmaxh_f16(a, b) simde_vmaxh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmax_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmax_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmax_f16 + #define vmax_f16(a, b) simde_vmax_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmax_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -47,14 +97,27 @@ simde_vmax_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if defined(SIMDE_RISCV_V_NATIVE) #if !defined(SIMDE_FAST_NANS) - r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? 
b_.values[i] : SIMDE_MATH_NANF); + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv64 , 2) , 512 , 2); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 2); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 2); + r_.sv64 = __riscv_vfmax_vv_f32m1_m(vab_mask , a_.sv64 , b_.sv64 , 2); + r_.sv64 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv64 , vab_mask , 2); #else - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + r_.sv64 = __riscv_vfmax_vv_f32m1(a_.sv64, b_.sv64, 2); #endif - } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if !defined(SIMDE_FAST_NANS) + r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NANF); + #else + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + #endif + } + #endif return simde_float32x2_from_private(r_); #endif @@ -75,14 +138,28 @@ simde_vmax_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if defined(SIMDE_RISCV_V_NATIVE) #if !defined(SIMDE_FAST_NANS) - r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NAN); + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv64 , 1) , 512 , 1); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 1); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 1); + r_.sv64 = __riscv_vfmax_vv_f64m1_m(vab_mask , a_.sv64 , b_.sv64 , 1); + r_.sv64 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv64, vab_mask , 1); #else - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + r_.sv64 = __riscv_vfmax_vv_f64m1(a_.sv64, b_.sv64, 1); #endif - } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if !defined(SIMDE_FAST_NANS) + r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NAN); + #else + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + #endif + } + #endif return simde_float64x1_from_private(r_); #endif @@ -105,10 +182,14 @@ simde_vmax_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int8x8_from_private(r_); #endif @@ -131,10 +212,14 @@ simde_vmax_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? 
a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64, b_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int16x4_from_private(r_); #endif @@ -157,10 +242,14 @@ simde_vmax_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int32x2_from_private(r_); #endif @@ -181,10 +270,14 @@ simde_x_vmax_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int64x1_from_private(r_); #endif @@ -203,10 +296,14 @@ simde_vmax_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint8x8_from_private(r_); #endif @@ -232,6 +329,8 @@ simde_vmax_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ r_.m64 = _mm_add_pi16(b_.m64, _mm_subs_pu16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -260,10 +359,14 @@ simde_vmax_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } + #endif return simde_uint32x2_from_private(r_); #endif @@ -284,15 +387,44 @@ simde_x_vmax_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint64x1_from_private(r_); #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxq_f16 + #define vmaxq_f16(a, b) simde_vmaxq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -340,6 +472,17 @@ simde_vmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_max(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv128 , 4) , 512 , 4); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 4); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 4); + r_.sv128 = __riscv_vfmax_vv_f32m1_m(vab_mask , a_.sv128 , b_.sv128 , 4); + r_.sv128 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv128 , vab_mask , 4); + #else + r_.sv128 = __riscv_vfmax_vv_f32m1(a_.sv128, b_.sv128, 4); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -387,6 +530,18 @@ simde_vmaxq_f64(simde_float64x2_t a, simde_float64x2_t b) { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_max(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv128 , 2) , 512 , 2); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 2); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 2); + r_.sv128 = __riscv_vfmax_vv_f64m1_m(vab_mask , a_.sv128 , b_.sv128 , 2); + r_.sv128 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv128, vab_mask , 2); + #else + r_.sv128 = __riscv_vfmax_vv_f64m1(a_.sv128, b_.sv128, 2); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -430,6 +585,15 @@ simde_vmaxq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.v128 = wasm_i8x16_max(a_.v128, 
b_.v128); #endif + return simde_int8x16_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128, b_.sv128, 16); + return simde_int8x16_from_private(r_); #else return simde_vbslq_s8(simde_vcgtq_s8(a, b), a, b); @@ -461,6 +625,15 @@ simde_vmaxq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.v128 = wasm_i16x8_max(a_.v128, b_.v128); #endif + return simde_int16x8_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128, b_.sv128, 8); + return simde_int16x8_from_private(r_); #else return simde_vbslq_s16(simde_vcgtq_s16(a, b), a, b); @@ -492,6 +665,15 @@ simde_vmaxq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.v128 = wasm_i32x4_max(a_.v128, b_.v128); #endif + return simde_int32x4_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128, b_.sv128, 4); + return simde_int32x4_from_private(r_); #else return simde_vbslq_s32(simde_vcgtq_s32(a, b), a, b); @@ -507,6 +689,15 @@ simde_int64x2_t simde_x_vmaxq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_max(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128, b_.sv128, 2); + + return simde_int64x2_from_private(r_); #else return simde_vbslq_s64(simde_vcgtq_s64(a, b), a, b); #endif @@ -533,6 +724,15 @@ simde_vmaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.v128 = wasm_u8x16_max(a_.v128, b_.v128); #endif + return simde_uint8x16_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u8m1(a_.sv128, b_.sv128, 16); + return simde_uint8x16_from_private(r_); #else return simde_vbslq_u8(simde_vcgtq_u8(a, b), a, b); @@ -567,6 +767,15 @@ simde_vmaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.v128 = wasm_u16x8_max(a_.v128, b_.v128); #endif + return simde_uint16x8_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u16m1(a_.sv128, b_.sv128, 8); + return simde_uint16x8_from_private(r_); #else return simde_vbslq_u16(simde_vcgtq_u16(a, b), a, b); @@ -598,6 +807,15 @@ simde_vmaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.v128 = wasm_u32x4_max(a_.v128, b_.v128); #endif + return simde_uint32x4_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u32m1(a_.sv128, b_.sv128, 4); + return simde_uint32x4_from_private(r_); #else return simde_vbslq_u32(simde_vcgtq_u32(a, b), a, b); @@ -613,6 +831,15 @@ simde_uint64x2_t simde_x_vmaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_max(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + 
b_ = simde_uint64x2_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u64m1(a_.sv128, b_.sv128, 2); + + return simde_uint64x2_from_private(r_); #else return simde_vbslq_u64(simde_vcgtq_u64(a, b), a, b); #endif diff --git a/arm/neon/maxnm.h b/arm/neon/maxnm.h index b9aceb02c..c8ed1b557 100644 --- a/arm/neon/maxnm.h +++ b/arm/neon/maxnm.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MAXNM_H) @@ -35,10 +36,91 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxnmh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmh_f16(a, b); + #else + #if defined(simde_math_fmaxf) + return simde_float16_from_float32(simde_math_fmaxf(simde_float16_to_float32(a), simde_float16_to_float32(b))); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + simde_float32_t r_; + if (a_ > b_) { + r_ = a_; + } else if (a_ < b_) { + r_ = b_; + } else if (a_ == a_) { + r_ = a_; + } else { + r_ = b_; + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxnmh_f16 + #define vmaxnmh_f16(a, b) simde_vmaxnmh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnm_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxnm_f16 + #define vmaxnm_f16(a, b) simde_vmaxnm_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxnmq_f16 + #define vmaxnmq_f16(a, b) simde_vmaxnmq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) 
&& defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) return vmaxnm_f32(a, b); #else simde_float32x2_private @@ -66,7 +148,8 @@ simde_vmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vmaxnm_f32 #define vmaxnm_f32(a, b) simde_vmaxnm_f32((a), (b)) #endif @@ -110,7 +193,7 @@ simde_vmaxnm_f64(simde_float64x1_t a, simde_float64x1_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmaxnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) return vmaxnmq_f32(a, b); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_max(a, b); @@ -154,7 +237,8 @@ simde_vmaxnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vmaxnmq_f32 #define vmaxnmq_f32(a, b) simde_vmaxnmq_f32((a), (b)) #endif diff --git a/arm/neon/maxnmv.h b/arm/neon/maxnmv.h new file mode 100644 index 000000000..6ca95e08e --- /dev/null +++ b/arm/neon/maxnmv.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_MAXNMV_H) +#define SIMDE_ARM_NEON_MAXNMV_H + +#include "types.h" +#include + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmaxnmv_f32(simde_float32x2_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vmaxnmv_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + + r = -SIMDE_MATH_INFINITYF; + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a_.sv64, \ + __riscv_vfmv_v_f_f32m1(r, 2), 2)); + #else + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmv_f32 + #define vmaxnmv_f32(v) simde_vmaxnmv_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmaxnmvq_f32(simde_float32x4_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vmaxnmvq_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + + r = -SIMDE_MATH_INFINITYF; + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a_.sv128, \ + __riscv_vfmv_v_f_f32m1(r, 4), 4)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && HEDLEY_HAS_BUILTIN(__builtin_reduce_max) + simde_float32_t rst = __builtin_reduce_max(a_.values); + r = (rst > r) ? rst : r; + #else + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmvq_f32 + #define vmaxnmvq_f32(v) simde_vmaxnmvq_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmaxnmvq_f64(simde_float64x2_t a) { + simde_float64_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vmaxnmvq_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + + r = -SIMDE_MATH_INFINITY; + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmax_vs_f64m1_f64m1(a_.sv128, \ + __riscv_vfmv_v_f_f64m1(r, 2), 2)); + #else + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? 
a_.values[i] : r; + } + #endif + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmvq_f64 + #define vmaxnmvq_f64(v) simde_vmaxnmvq_f64(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxnmv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmv_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmax_vs_f16m1_f16m1(a_.sv64, \ + __riscv_vfmv_v_f_f16m1(SIMDE_NINFINITYHF, 4), 4)); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a > r_ ? tmp_a : r_; + #else + r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxnmv_f16 + #define vmaxnmv_f16(v) simde_vmaxnmv_f16(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxnmvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmvq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmax_vs_f16m1_f16m1(a_.sv128, \ + __riscv_vfmv_v_f_f16m1(SIMDE_NINFINITYHF, 8), 8)); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a > r_ ? tmp_a : r_; + #else + r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? 
r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxnmvq_f16 + #define vmaxnmvq_f16(v) simde_vmaxnmvq_f16(v) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MAXNMV_H) */ diff --git a/arm/neon/maxv.h b/arm/neon/maxv.h index 37437b04d..bd545ad6c 100644 --- a/arm/neon/maxv.h +++ b/arm/neon/maxv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MAXV_H) @@ -34,6 +35,39 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxv_f16(a); + #else + simde_float32_t r; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + r = simde_float16_to_float32(SIMDE_NINFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 > r ? a32 : r; + #else + r = a32 > r ? a32 : (a32 <= r ? r : ((a32 == a32) ? r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxv_f16 + #define vmaxv_f16(v) simde_vmaxv_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vmaxv_f32(simde_float32x2_t a) { @@ -202,6 +236,39 @@ simde_vmaxv_u32(simde_uint32x2_t a) { #define vmaxv_u32(v) simde_vmaxv_u32(v) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxvq_f16(a); + #else + simde_float32_t r; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + r = simde_float16_to_float32(SIMDE_NINFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 > r ? a32 : r; + #else + r = a32 > r ? a32 : (a32 <= r ? r : ((a32 == a32) ? 
r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmaxvq_f16 + #define vmaxvq_f16(v) simde_vmaxvq_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vmaxvq_f32(simde_float32x4_t a) { diff --git a/arm/neon/min.h b/arm/neon/min.h index 08ea4d003..bcd201a36 100644 --- a/arm/neon/min.h +++ b/arm/neon/min.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MIN_H) @@ -36,11 +37,77 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminh_f16(a, b); + #else + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + #if !defined(SIMDE_FAST_NANS) + r_ = (a_ <= b_) ? a_ : ((a_ > b_) ? b_ : SIMDE_MATH_NANF); + #else + r_ = (a_ < b_) ? a_ : b_; + #endif + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vminh_f16 + #define vminh_f16(a, b) simde_vminh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmin_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmin_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vmin_f16 + #define vmin_f16(a, b) simde_vmin_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmin_f32(simde_float32x2_t a, simde_float32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmin_f32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if !defined(SIMDE_FAST_NANS) + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv64 , 2) , 512 , 2); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 2); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 2); + r_.sv64 = __riscv_vfmin_vv_f32m1_m(vab_mask , a_.sv64 , b_.sv64 , 2); + r_.sv64 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv64 , vab_mask , 2); + #else + r_.sv64 = __riscv_vfmin_vv_f32m1(a_.sv64, b_.sv64, 2); + #endif + + return simde_float32x2_from_private(r_); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(64) simde_float32x2_t r = simde_vbsl_f32(simde_vcgt_f32(b, a), a, b); @@ -83,6 +150,24 @@ simde_float64x1_t simde_vmin_f64(simde_float64x1_t a, simde_float64x1_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmin_f64(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = 
simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + #if !defined(SIMDE_FAST_NANS) + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv64 , 1) , 512 , 1); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 1); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 1); + r_.sv64 = __riscv_vfmin_vv_f64m1_m(vab_mask , a_.sv64 , b_.sv64 , 1); + r_.sv64 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv64, vab_mask , 1); + #else + r_.sv64 = __riscv_vfmin_vv_f64m1(a_.sv64, b_.sv64, 1); + #endif + + return simde_float64x1_from_private(r_); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(64) simde_float64x1_t r = simde_vbsl_f64(simde_vcgt_f64(b, a), a, b); @@ -133,10 +218,14 @@ simde_vmin_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int8x8_from_private(r_); #endif @@ -159,8 +248,8 @@ simde_vmin_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_X86_MMX_NATIVE) - r_.m64 = _mm_sub_pi16(a_.m64, _mm_subs_pu16(b_.m64)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -189,10 +278,14 @@ simde_vmin_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int32x2_from_private(r_); #endif @@ -213,10 +306,14 @@ simde_x_vmin_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int64x1_from_private(r_); #endif @@ -235,10 +332,14 @@ simde_vmin_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint8x8_from_private(r_); #endif @@ -264,6 +365,8 @@ simde_vmin_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ r_.m64 = _mm_sub_pi16(a_.m64, _mm_subs_pu16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -292,10 +395,14 @@ simde_vmin_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint32x2_from_private(r_); #endif @@ -316,14 +423,43 @@ simde_x_vmin_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif + + return simde_uint64x1_from_private(r_); + #endif +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vminq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + r_.values[i] = simde_vminh_f16(a_.values[i], b_.values[i]); } - return simde_uint64x1_from_private(r_); + return simde_float16x8_from_private(r_); #endif } +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vminq_f16 + #define vminq_f16(a, b) simde_vminq_f16((a), (b)) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t @@ -344,6 +480,17 @@ simde_vminq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_blendv_ps(_mm_set1_ps(SIMDE_MATH_NANF), _mm_min_ps(a_.m128, b_.m128), _mm_cmpord_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv128 , 4) , 512 , 4); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 4); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 4); + r_.sv128 = __riscv_vfmin_vv_f32m1_m(vab_mask , a_.sv128 , b_.sv128 , 4); + r_.sv128 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv128 , vab_mask , 4); + #else + r_.sv128 = __riscv_vfmin_vv_f32m1(a_.sv128, b_.sv128, 4); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -388,6 +535,18 @@ simde_vminq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_blendv_pd(_mm_set1_pd(SIMDE_MATH_NAN), _mm_min_pd(a_.m128d, b_.m128d), _mm_cmpord_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv128 , 2) , 512 , 2); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 2); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 2); + r_.sv128 = __riscv_vfmin_vv_f64m1_m(vab_mask , a_.sv128 , b_.sv128 , 2); + r_.sv128 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv128, vab_mask , 2); + #else + r_.sv128 = __riscv_vfmin_vv_f64m1(a_.sv128, b_.sv128, 2); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -430,6 +589,8 @@ simde_vminq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_min_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -462,6 +623,8 @@ simde_vminq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_min_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -494,6 +657,8 @@ simde_vminq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_min_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = 
__riscv_vmin_vv_i32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -522,6 +687,8 @@ simde_x_vminq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm_min_epi64(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i64m1(a_.sv128, b_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -550,6 +717,8 @@ simde_vminq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_min_epu8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -585,6 +754,8 @@ simde_vminq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_sub_epi16(a_.m128i, _mm_subs_epu16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -640,6 +811,8 @@ simde_vminq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -666,10 +839,14 @@ simde_x_vminq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u64m1(a_.sv128, b_.sv128, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } + #endif return simde_uint64x2_from_private(r_); #endif diff --git a/arm/neon/minnm.h b/arm/neon/minnm.h index b68a28cb7..a72a01774 100644 --- a/arm/neon/minnm.h +++ b/arm/neon/minnm.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MINNM_H) @@ -35,10 +36,66 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminnmh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vminnmh_f16(a, b); + #else + #if defined(simde_math_fminf) + return simde_float16_from_float32(simde_math_fminf(simde_float16_to_float32(a), simde_float16_to_float32(b))); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + simde_float32_t r_; + if (a_ < b_) { + r_ = a_; + } else if (a_ > b_) { + r_ = b_; + } else if (a_ == a_) { + r_ = a_; + } else { + r_ = b_; + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) + #undef vminnmh_f16 + #define vminnmh_f16(a, b) simde_vminnmh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vminnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vminnm_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) + #undef vminnm_f16 + #define vminnm_f16(a, b) simde_vminnm_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vminnm_f32(simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) return vminnm_f32(a, b); #else simde_float32x2_private @@ -66,7 +123,8 @@ simde_vminnm_f32(simde_float32x2_t a, simde_float32x2_t b) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vminnm_f32 #define vminnm_f32(a, b) simde_vminnm_f32((a), (b)) #endif @@ -107,10 +165,35 @@ simde_vminnm_f64(simde_float64x1_t a, simde_float64x1_t b) { #define vminnm_f64(a, b) simde_vminnm_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vminnmq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = 
simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) + #undef vminnmq_f16 + #define vminnmq_f16(a, b) simde_vminnmq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) return vminnmq_f32(a, b); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_FAST_NANS) return simde_vbslq_f32(simde_vcleq_f32(a, b), a, b); @@ -155,7 +238,8 @@ simde_vminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vminnmq_f32 #define vminnmq_f32(a, b) simde_vminnmq_f32((a), (b)) #endif diff --git a/arm/neon/minnmv.h b/arm/neon/minnmv.h new file mode 100644 index 000000000..648e8d90e --- /dev/null +++ b/arm/neon/minnmv.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_MINNMV_H) +#define SIMDE_ARM_NEON_MINNMV_H + +#include "types.h" +#include + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminnmv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminnmv_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmin_vs_f16m1_f16m1(a_.sv64, \ + __riscv_vfmv_v_f_f16m1(SIMDE_INFINITYHF, 4), 4)); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); + + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a < r_ ? tmp_a : r_; + #else + r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vminnmv_f16 + #define vminnmv_f16(v) simde_vminnmv_f16(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vminnmv_f32(simde_float32x2_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vminnmv_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + + r = SIMDE_MATH_INFINITYF; + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a_.sv64, \ + __riscv_vfmv_v_f_f32m1(r, 2), 2)); + #else + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmv_f32 + #define vminnmv_f32(v) simde_vminnmv_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminnmvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminnmvq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmin_vs_f16m1_f16m1(a_.sv128, \ + __riscv_vfmv_v_f_f16m1(SIMDE_INFINITYHF, 8), 8)); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a < r_ ? tmp_a : r_; + #else + r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? 
r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vminnmvq_f16 + #define vminnmvq_f16(v) simde_vminnmvq_f16(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vminnmvq_f32(simde_float32x4_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vminnmvq_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + + r = SIMDE_MATH_INFINITYF; + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a_.sv128, \ + __riscv_vfmv_v_f_f32m1(r, 4), 4)); + #else + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmvq_f32 + #define vminnmvq_f32(v) simde_vminnmvq_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vminnmvq_f64(simde_float64x2_t a) { + simde_float64_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vminnmvq_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + + r = SIMDE_MATH_INFINITY; + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmin_vs_f64m1_f64m1(a_.sv128, \ + __riscv_vfmv_v_f_f64m1(r, 2), 2)); + #else + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmvq_f64 + #define vminnmvq_f64(v) simde_vminnmvq_f64(v) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MINNMV_H) */ diff --git a/arm/neon/minv.h b/arm/neon/minv.h index 93028d74f..3ab62a703 100644 --- a/arm/neon/minv.h +++ b/arm/neon/minv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MINV_H) @@ -34,6 +35,39 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminv_f16(a); + #else + simde_float32_t r; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + r = simde_float16_to_float32(SIMDE_INFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 < r ? a32 : r; + #else + r = a32 < r ? a32 : (a32 >= r ? r : ((a32 == a32) ? 
r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vminv_f16 + #define vminv_f16(v) simde_vminv_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vminv_f32(simde_float32x2_t a) { @@ -210,6 +244,39 @@ simde_vminv_u32(simde_uint32x2_t a) { #define vminv_u32(v) simde_vminv_u32(v) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminvq_f16(a); + #else + simde_float32_t r; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + r = simde_float16_to_float32(SIMDE_INFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 < r ? a32 : r; + #else + r = a32 < r ? a32 : (a32 >= r ? r : ((a32 == a32) ? r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vminvq_f16 + #define vminvq_f16(v) simde_vminvq_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vminvq_f32(simde_float32x4_t a) { diff --git a/arm/neon/mla.h b/arm/neon/mla.h index 4c57edaf6..aaf24a02b 100644 --- a/arm/neon/mla.h +++ b/arm/neon/mla.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLA_H) @@ -41,6 +42,15 @@ simde_float32x2_t simde_vmla_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vadd_f32(simde_vmul_f32(b, c), a); #endif @@ -55,6 +65,15 @@ simde_float64x1_t simde_vmla_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmla_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vadd_f64(simde_vmul_f64(b, c), a); #endif @@ -69,6 +88,15 @@ simde_int8x8_t simde_vmla_s8(simde_int8x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b), + c_ = simde_int8x8_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_i8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_int8x8_from_private(r_); #else return simde_vadd_s8(simde_vmul_s8(b, c), a); #endif @@ -83,6 +111,15 @@ simde_int16x4_t simde_vmla_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) 
{ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_i16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_int16x4_from_private(r_); #else return simde_vadd_s16(simde_vmul_s16(b, c), a); #endif @@ -97,6 +134,15 @@ simde_int32x2_t simde_vmla_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_i32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_int32x2_from_private(r_); #else return simde_vadd_s32(simde_vmul_s32(b, c), a); #endif @@ -111,6 +157,15 @@ simde_uint8x8_t simde_vmla_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b), + c_ = simde_uint8x8_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_u8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_uint8x8_from_private(r_); #else return simde_vadd_u8(simde_vmul_u8(b, c), a); #endif @@ -125,6 +180,15 @@ simde_uint16x4_t simde_vmla_u16(simde_uint16x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b), + c_ = simde_uint16x4_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_u16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_uint16x4_from_private(r_); #else return simde_vadd_u16(simde_vmul_u16(b, c), a); #endif @@ -139,6 +203,15 @@ simde_uint32x2_t simde_vmla_u32(simde_uint32x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b), + c_ = simde_uint32x2_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_u32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_uint32x2_from_private(r_); #else return simde_vadd_u32(simde_vmul_u32(b, c), a); #endif @@ -156,7 +229,7 @@ simde_vmlaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), @@ -165,6 +238,8 @@ simde_vmlaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128 = _mm_fmadd_ps(b_.m128, c_.m128, a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); #endif return simde_float32x4_from_private(r_); @@ -185,7 +260,7 @@ simde_vmlaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float64x2_private r_, a_ = 
simde_float64x2_to_private(a), @@ -194,6 +269,8 @@ simde_vmlaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128d = _mm_fmadd_pd(b_.m128d, c_.m128d, a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); #endif return simde_float64x2_from_private(r_); @@ -211,6 +288,15 @@ simde_int8x16_t simde_vmlaq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_i8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_int8x16_from_private(r_); #else return simde_vaddq_s8(simde_vmulq_s8(b, c), a); #endif @@ -225,6 +311,15 @@ simde_int16x8_t simde_vmlaq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_i16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vmulq_s16(b, c), a); #endif @@ -239,6 +334,15 @@ simde_int32x4_t simde_vmlaq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_i32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vmulq_s32(b, c), a); #endif @@ -253,6 +357,15 @@ simde_uint8x16_t simde_vmlaq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_u8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_uint8x16_from_private(r_); #else return simde_vaddq_u8(simde_vmulq_u8(b, c), a); #endif @@ -267,6 +380,15 @@ simde_uint16x8_t simde_vmlaq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_u16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vmulq_u16(b, c), a); #endif @@ -281,6 +403,15 @@ simde_uint32x4_t simde_vmlaq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_u32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_uint32x4_from_private(r_); #else 
return simde_vaddq_u32(simde_vmulq_u32(b, c), a); #endif diff --git a/arm/neon/mla_lane.h b/arm/neon/mla_lane.h new file mode 100644 index 000000000..ad383d473 --- /dev/null +++ b/arm/neon/mla_lane.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLA_LANE_H) +#define SIMDE_ARM_NEON_MLA_LANE_H + +#include "mla.h" +#include "dup_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_f32(a, b, v, lane) vmla_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_f32(a, b, v, lane) simde_vmla_f32((a), (b), simde_vdup_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_f32 + #define vmla_lane_f32(a, b, v, lane) simde_vmla_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_f32(a, b, v, lane) vmla_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_f32(a, b, v, lane) simde_vmla_f32((a), (b), simde_vdup_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_f32 + #define vmla_laneq_f32(a, b, v, lane) simde_vmla_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_f32(a, b, v, lane) vmlaq_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_f32(a, b, v, lane) simde_vmlaq_f32((a), (b), simde_vdupq_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_f32 + #define vmlaq_laneq_f32(a, b, v, lane) simde_vmlaq_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_s16(a, b, v, lane) vmla_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_s16(a, b, v, lane) simde_vmla_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_s16 + #define vmla_lane_s16(a, b, v, lane) simde_vmla_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_s16(a, b, v, lane) vmla_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_s16(a, b, v, lane) 
simde_vmla_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_s16 + #define vmla_laneq_s16(a, b, v, lane) simde_vmla_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_s16(a, b, v, lane) vmlaq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_s16(a, b, v, lane) simde_vmlaq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_s16 + #define vmlaq_laneq_s16(a, b, v, lane) simde_vmlaq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_s32(a, b, v, lane) vmla_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_s32(a, b, v, lane) simde_vmla_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_s32 + #define vmla_lane_s32(a, b, v, lane) simde_vmla_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_s32(a, b, v, lane) vmla_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_s32(a, b, v, lane) simde_vmla_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_s32 + #define vmla_laneq_s32(a, b, v, lane) simde_vmla_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_s32(a, b, v, lane) vmlaq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_s32(a, b, v, lane) simde_vmlaq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_s32 + #define vmlaq_laneq_s32(a, b, v, lane) simde_vmlaq_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_u16(a, b, v, lane) vmla_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_u16(a, b, v, lane) simde_vmla_u16((a), (b), simde_vdup_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_u16 + #define vmla_lane_u16(a, b, v, lane) simde_vmla_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_u16(a, b, v, lane) vmla_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_u16(a, b, v, lane) simde_vmla_u16((a), (b), simde_vdup_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_u16 + #define vmla_laneq_u16(a, b, v, lane) simde_vmla_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_u16(a, b, v, lane) vmlaq_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_u16(a, b, v, lane) simde_vmlaq_u16((a), (b), simde_vdupq_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_u16 + #define vmlaq_laneq_u16(a, b, v, lane) simde_vmlaq_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_u32(a, b, v, lane) vmla_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_u32(a, b, v, lane) simde_vmla_u32((a), (b), simde_vdup_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_u32 + #define 
vmla_lane_u32(a, b, v, lane) simde_vmla_lane_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_u32(a, b, v, lane) vmla_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_u32(a, b, v, lane) simde_vmla_u32((a), (b), simde_vdup_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_u32 + #define vmla_laneq_u32(a, b, v, lane) simde_vmla_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_u32(a, b, v, lane) vmlaq_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_u32(a, b, v, lane) simde_vmlaq_u32((a), (b), simde_vdupq_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_u32 + #define vmlaq_laneq_u32(a, b, v, lane) simde_vmlaq_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_f32(a, b, v, lane) vmlaq_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_f32(a, b, v, lane) simde_vmlaq_f32((a), (b), simde_vdupq_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_f32 + #define vmlaq_lane_f32(a, b, v, lane) simde_vmlaq_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_s16(a, b, v, lane) vmlaq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_s16(a, b, v, lane) simde_vmlaq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_s16 + #define vmlaq_lane_s16(a, b, v, lane) simde_vmlaq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_s32(a, b, v, lane) vmlaq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_s32(a, b, v, lane) simde_vmlaq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_s32 + #define vmlaq_lane_s32(a, b, v, lane) simde_vmlaq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_u16(a, b, v, lane) vmlaq_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_u16(a, b, v, lane) simde_vmlaq_u16((a), (b), simde_vdupq_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_u16 + #define vmlaq_lane_u16(a, b, v, lane) simde_vmlaq_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_u32(a, b, v, lane) vmlaq_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_u32(a, b, v, lane) simde_vmlaq_u32((a), (b), simde_vdupq_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_u32 + #define vmlaq_lane_u32(a, b, v, lane) simde_vmlaq_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLA_LANE_H) */ diff --git a/arm/neon/mla_n.h b/arm/neon/mla_n.h index f4521eb5f..ecb726d9b 100644 --- a/arm/neon/mla_n.h +++ b/arm/neon/mla_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLA_N_H) @@ -48,7 +49,9 @@ simde_vmla_n_f32(simde_float32x2_t a, simde_float32x2_t b, 
simde_float32 c) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmacc_vf_f32m1(a_.sv64 , c , b_.sv64 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -76,7 +79,9 @@ simde_vmla_n_s16(simde_int16x4_t a, simde_int16x4_t b, int16_t c) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_i16m1(a_.sv64 , c , b_.sv64 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -104,7 +109,9 @@ simde_vmla_n_s32(simde_int32x2_t a, simde_int32x2_t b, int32_t c) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_i32m1(a_.sv64 , c , b_.sv64 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -132,7 +139,9 @@ simde_vmla_n_u16(simde_uint16x4_t a, simde_uint16x4_t b, uint16_t c) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_u16m1(a_.sv64 , c , b_.sv64 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -160,7 +169,9 @@ simde_vmla_n_u32(simde_uint32x2_t a, simde_uint32x2_t b, uint32_t c) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_u32m1(a_.sv64 , c , b_.sv64 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -182,7 +193,7 @@ simde_float32x4_t simde_vmlaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32 c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_f32(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_f32(simde_vmulq_n_f32(b, c), a); #else simde_float32x4_private @@ -190,7 +201,9 @@ simde_vmlaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32 c) { a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vf_f32m1(a_.sv128 , c , b_.sv128 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -212,7 +225,7 @@ simde_int16x8_t simde_vmlaq_n_s16(simde_int16x8_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_s16(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && 
!defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s16(simde_vmulq_n_s16(b, c), a); #else simde_int16x8_private @@ -220,7 +233,9 @@ simde_vmlaq_n_s16(simde_int16x8_t a, simde_int16x8_t b, int16_t c) { a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_i16m1(a_.sv128 , c , b_.sv128 , 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -242,7 +257,7 @@ simde_int32x4_t simde_vmlaq_n_s32(simde_int32x4_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_s32(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s32(simde_vmulq_n_s32(b, c), a); #else simde_int32x4_private @@ -250,7 +265,9 @@ simde_vmlaq_n_s32(simde_int32x4_t a, simde_int32x4_t b, int32_t c) { a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_i32m1(a_.sv128 , c , b_.sv128 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -272,7 +289,7 @@ simde_uint16x8_t simde_vmlaq_n_u16(simde_uint16x8_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_u16(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u16(simde_vmulq_n_u16(b, c), a); #else simde_uint16x8_private @@ -280,7 +297,9 @@ simde_vmlaq_n_u16(simde_uint16x8_t a, simde_uint16x8_t b, uint16_t c) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_u16m1(a_.sv128 , c , b_.sv128 , 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -310,7 +329,9 @@ simde_vmlaq_n_u32(simde_uint32x4_t a, simde_uint32x4_t b, uint32_t c) { a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_u32m1(a_.sv128 , c , b_.sv128 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE diff --git a/arm/neon/mlal.h b/arm/neon/mlal.h index 0403b81a8..594fc26e3 100644 --- a/arm/neon/mlal.h +++ b/arm/neon/mlal.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_H) @@ -41,6 +42,15 @@ simde_int16x8_t simde_vmlal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + simde_int8x8_private c_ = simde_int8x8_to_private(c); + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64); + vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv64); + r_.sv128 = __riscv_vwmacc_vv_i16m1(a_.sv128 , vb , vc , 8); + return simde_int16x8_from_private(r_); 
#else return simde_vmlaq_s16(a, simde_vmovl_s8(b), simde_vmovl_s8(c)); #endif @@ -55,6 +65,15 @@ simde_int32x4_t simde_vmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + simde_int16x4_private c_ = simde_int16x4_to_private(c); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + vint16mf2_t vc = __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv64); + r_.sv128 = __riscv_vwmacc_vv_i32m1(a_.sv128 , vb , vc , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_s16(b), simde_vmovl_s16(c)); #endif @@ -69,6 +88,15 @@ simde_int64x2_t simde_vmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + simde_int32x2_private c_ = simde_int32x2_to_private(c); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + vint32mf2_t vc = __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv64); + r_.sv128 = __riscv_vwmacc_vv_i64m1(a_.sv128 , vb , vc , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -98,6 +126,15 @@ simde_uint16x8_t simde_vmlal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + simde_uint8x8_private c_ = simde_uint8x8_to_private(c); + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64); + vuint8mf2_t vc = __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv64); + r_.sv128 = __riscv_vwmaccu_vv_u16m1(a_.sv128 , vb , vc , 8); + return simde_uint16x8_from_private(r_); #else return simde_vmlaq_u16(a, simde_vmovl_u8(b), simde_vmovl_u8(c)); #endif @@ -112,6 +149,15 @@ simde_uint32x4_t simde_vmlal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + simde_uint16x4_private c_ = simde_uint16x4_to_private(c); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + vuint16mf2_t vc = __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv64); + r_.sv128 = __riscv_vwmaccu_vv_u32m1(a_.sv128 , vb , vc , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_u16(b), simde_vmovl_u16(c)); #endif @@ -126,6 +172,15 @@ simde_uint64x2_t simde_vmlal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + simde_uint32x2_private c_ = simde_uint32x2_to_private(c); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + vuint32mf2_t vc = __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv64); + r_.sv128 = 
__riscv_vwmaccu_vv_u64m1(a_.sv128 , vb , vc , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/arm/neon/mlal_high.h b/arm/neon/mlal_high.h index f7222d16f..21e7221ce 100644 --- a/arm/neon/mlal_high.h +++ b/arm/neon/mlal_high.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_HIGH_H) @@ -41,6 +42,15 @@ simde_int16x8_t simde_vmlal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + simde_int8x16_private c_ = simde_int8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_i8m1(c_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwmacc_vv_i16m1(a_.sv128 , __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv128) , 8); + return simde_int16x8_from_private(r_); #else return simde_vmlaq_s16(a, simde_vmovl_high_s8(b), simde_vmovl_high_s8(c)); #endif @@ -55,6 +65,15 @@ simde_int32x4_t simde_vmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + simde_int16x8_private c_ = simde_int16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_i16m1(c_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmacc_vv_i32m1(a_.sv128 , __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)); #endif @@ -69,6 +88,15 @@ simde_int64x2_t simde_vmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + simde_int32x4_private c_ = simde_int32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_i32m1(c_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmacc_vv_i64m1(a_.sv128 , __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv128) , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -98,6 +126,15 @@ simde_uint16x8_t simde_vmlal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + simde_uint8x16_private c_ = simde_uint8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_u8m1(c_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwmaccu_vv_u16m1(a_.sv128 , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 
__riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv128) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vmlaq_u16(a, simde_vmovl_high_u8(b), simde_vmovl_high_u8(c)); #endif @@ -112,6 +149,15 @@ simde_uint32x4_t simde_vmlal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + simde_uint16x8_private c_ = simde_uint16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_u16m1(c_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmaccu_vv_u32m1(a_.sv128 , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vmovl_high_u16(c)); #endif @@ -126,6 +172,15 @@ simde_uint64x2_t simde_vmlal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + simde_uint32x4_private c_ = simde_uint32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_u32m1(c_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmaccu_vv_u64m1(a_.sv128 , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/arm/neon/mlal_high_lane.h b/arm/neon/mlal_high_lane.h new file mode 100644 index 000000000..50018a95d --- /dev/null +++ b/arm/neon/mlal_high_lane.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_MLAL_HIGH_LANE_H + +#include "movl_high.h" +#include "mlal_high.h" +#include "dup_n.h" +#include "mla.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlal_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_s16(a, b, simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_s16(a, b, v, lane) vmlal_high_lane_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_s16 + #define vmlal_high_lane_s16(a, b, v, lane) simde_vmlal_high_lane_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlal_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlal_high_s16(a, b, simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_s16(a, b, v, lane) vmlal_high_laneq_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_s16 + #define vmlal_high_laneq_s16(a, b, v, lane) simde_vmlal_high_laneq_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlal_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlal_high_s32(a, b, simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_s32(a, b, v, lane) vmlal_high_lane_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_s32 + #define vmlal_high_lane_s32(a, b, v, lane) simde_vmlal_high_lane_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlal_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_s32(a, b, simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_s32(a, b, v, lane) vmlal_high_laneq_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_s32 + #define vmlal_high_laneq_s32(a, b, v, lane) simde_vmlal_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlal_high_lane_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_u16(a, b, v, lane) vmlal_high_lane_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_u16 + #define vmlal_high_lane_u16(a, b, v, lane) simde_vmlal_high_lane_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlal_high_laneq_u16(simde_uint32x4_t a, 
simde_uint16x8_t b, simde_uint16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlal_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_u16(a, b, v, lane) vmlal_high_laneq_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_u16 + #define vmlal_high_laneq_u16(a, b, v, lane) simde_vmlal_high_laneq_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlal_high_lane_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlal_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_u32(a, b, v, lane) vmlal_high_lane_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_u32 + #define vmlal_high_lane_u32(a, b, v, lane) simde_vmlal_high_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlal_high_laneq_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_u32(a, b, v, lane) vmlal_high_laneq_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_u32 + #define vmlal_high_laneq_u32(a, b, v, lane) simde_vmlal_high_laneq_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLAL_HIGH_LANE_H) */ diff --git a/arm/neon/mlal_high_n.h b/arm/neon/mlal_high_n.h index 0c26174ec..876c19333 100644 --- a/arm/neon/mlal_high_n.h +++ b/arm/neon/mlal_high_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Décio Luiz Gazzoni Filho + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H) @@ -41,6 +42,13 @@ simde_int32x4_t simde_vmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmacc_vx_i32m1(a_.sv128 , c , __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c)); #endif @@ -55,6 +63,13 @@ simde_int64x2_t simde_vmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmacc_vx_i64m1(a_.sv128 , c , __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128) , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -84,6 +99,13 @@ simde_uint32x4_t simde_vmlal_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, 
uint16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmaccu_vx_u32m1(a_.sv128 , c , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c)); #endif @@ -98,6 +120,13 @@ simde_uint64x2_t simde_vmlal_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmaccu_vx_u64m1(a_.sv128 , c , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/arm/neon/mlal_n.h b/arm/neon/mlal_n.h index 6025492d2..6b585c58a 100644 --- a/arm/neon/mlal_n.h +++ b/arm/neon/mlal_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_N_H) @@ -41,6 +42,13 @@ simde_int32x4_t simde_vmlal_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + r_.sv128 = __riscv_vwmacc_vx_i32m1(a_.sv128 , c , vb , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_s16(b), simde_vdupq_n_s32(c)); #endif @@ -55,13 +63,19 @@ simde_int64x2_t simde_vmlal_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + r_.sv128 = __riscv_vwmacc_vx_i64m1(a_.sv128 , c , vb , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(simde_vmovl_s32(b)), c_ = simde_int64x2_to_private(simde_vdupq_n_s64(c)); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = (b_.values * c_.values) + a_.values; #else @@ -84,6 +98,13 @@ simde_uint32x4_t simde_vmlal_n_u16(simde_uint32x4_t a, simde_uint16x4_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + r_.sv128 = __riscv_vwmaccu_vx_u32m1(a_.sv128 , c , vb , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_u16(b), simde_vdupq_n_u32(c)); #endif @@ -98,6 +119,13 @@ simde_uint64x2_t simde_vmlal_n_u32(simde_uint64x2_t a, 
simde_uint32x2_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + r_.sv128 = __riscv_vwmaccu_vx_u64m1(a_.sv128 , c , vb , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/arm/neon/mls.h b/arm/neon/mls.h index 83fb42fc7..cff43085f 100644 --- a/arm/neon/mls.h +++ b/arm/neon/mls.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLS_H) @@ -39,6 +40,14 @@ simde_float32x2_t simde_vmls_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vsub_f32(a, simde_vmul_f32(b, c)); #endif @@ -53,6 +62,14 @@ simde_float64x1_t simde_vmls_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmls_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vsub_f64(a, simde_vmul_f64(b, c)); #endif @@ -67,6 +84,14 @@ simde_int8x8_t simde_vmls_s8(simde_int8x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b), + c_ = simde_int8x8_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_i8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_int8x8_from_private(r_); #else return simde_vsub_s8(a, simde_vmul_s8(b, c)); #endif @@ -81,6 +106,14 @@ simde_int16x4_t simde_vmls_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_i16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_int16x4_from_private(r_); #else return simde_vsub_s16(a, simde_vmul_s16(b, c)); #endif @@ -95,6 +128,14 @@ simde_int32x2_t simde_vmls_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_i32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_int32x2_from_private(r_); #else return simde_vsub_s32(a, simde_vmul_s32(b, c)); #endif @@ -109,6 +150,14 @@ simde_uint8x8_t simde_vmls_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b), + c_ = simde_uint8x8_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_u8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_uint8x8_from_private(r_); #else return simde_vsub_u8(a, simde_vmul_u8(b, c)); #endif @@ -123,6 +172,14 @@ simde_uint16x4_t simde_vmls_u16(simde_uint16x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b), + c_ = simde_uint16x4_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_u16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_uint16x4_from_private(r_); #else return simde_vsub_u16(a, simde_vmul_u16(b, c)); #endif @@ -137,6 +194,14 @@ simde_uint32x2_t simde_vmls_u32(simde_uint32x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b), + c_ = simde_uint32x2_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_u32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_uint32x2_from_private(r_); #else return simde_vsub_u32(a, simde_vmul_u32(b, c)); #endif @@ -151,8 +216,7 @@ simde_float32x4_t simde_vmlsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_f32(a, b, c); - #elif \ - defined(SIMDE_X86_FMA_NATIVE) + #elif defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), @@ -161,6 +225,8 @@ simde_vmlsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfnmsac_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); #endif return simde_float32x4_from_private(r_); @@ -178,8 +244,7 @@ simde_float64x2_t simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsq_f64(a, b, c); - #elif \ - defined(SIMDE_X86_FMA_NATIVE) + #elif defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), @@ -188,6 +253,8 @@ simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfnmsac_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); #endif return simde_float64x2_from_private(r_); @@ -195,7 +262,7 @@ simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { return simde_vsubq_f64(a, simde_vmulq_f64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vmlsq_f64 #define vmlsq_f64(a, b, c) simde_vmlsq_f64((a), (b), (c)) #endif @@ -205,6 +272,14 @@ simde_int8x16_t simde_vmlsq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = 
simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_i8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_int8x16_from_private(r_); #else return simde_vsubq_s8(a, simde_vmulq_s8(b, c)); #endif @@ -219,6 +294,14 @@ simde_int16x8_t simde_vmlsq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_i16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(a, simde_vmulq_s16(b, c)); #endif @@ -233,6 +316,14 @@ simde_int32x4_t simde_vmlsq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_i32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmulq_s32(b, c)); #endif @@ -247,6 +338,14 @@ simde_uint8x16_t simde_vmlsq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_u8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_uint8x16_from_private(r_); #else return simde_vsubq_u8(a, simde_vmulq_u8(b, c)); #endif @@ -261,6 +360,14 @@ simde_uint16x8_t simde_vmlsq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_u16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(a, simde_vmulq_u16(b, c)); #endif @@ -275,6 +382,14 @@ simde_uint32x4_t simde_vmlsq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_u32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmulq_u32(b, c)); #endif diff --git a/arm/neon/mls_lane.h b/arm/neon/mls_lane.h new file mode 100644 index 000000000..35855a2b7 --- /dev/null +++ b/arm/neon/mls_lane.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do 
so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLS_LANE_H) +#define SIMDE_ARM_NEON_MLS_LANE_H + +#include "mls.h" +#include "dup_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_f32(a, b, v, lane) vmls_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_f32(a, b, v, lane) simde_vmls_f32((a), (b), simde_vdup_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_f32 + #define vmls_lane_f32(a, b, v, lane) simde_vmls_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_f32(a, b, v, lane) vmls_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_f32(a, b, v, lane) simde_vmls_f32((a), (b), simde_vdup_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_f32 + #define vmls_laneq_f32(a, b, v, lane) simde_vmls_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_f32(a, b, v, lane) vmlsq_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_f32(a, b, v, lane) simde_vmlsq_f32((a), (b), simde_vdupq_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_f32 + #define vmlsq_laneq_f32(a, b, v, lane) simde_vmlsq_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_s16(a, b, v, lane) vmls_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_s16(a, b, v, lane) simde_vmls_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_s16 + #define vmls_lane_s16(a, b, v, lane) simde_vmls_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_s16(a, b, v, lane) vmls_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_s16(a, b, v, lane) simde_vmls_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_s16 + #define vmls_laneq_s16(a, b, v, lane) simde_vmls_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_s16(a, b, v, lane) vmlsq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_s16(a, b, v, lane) simde_vmlsq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_s16 + #define vmlsq_laneq_s16(a, b, v, lane) simde_vmlsq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define 
simde_vmls_lane_s32(a, b, v, lane) vmls_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_s32(a, b, v, lane) simde_vmls_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_s32 + #define vmls_lane_s32(a, b, v, lane) simde_vmls_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_s32(a, b, v, lane) vmls_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_s32(a, b, v, lane) simde_vmls_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_s32 + #define vmls_laneq_s32(a, b, v, lane) simde_vmls_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_s32(a, b, v, lane) vmlsq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_s32(a, b, v, lane) simde_vmlsq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_s32 + #define vmlsq_laneq_s32(a, b, v, lane) simde_vmlsq_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_u16(a, b, v, lane) vmls_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_u16(a, b, v, lane) simde_vmls_u16((a), (b), simde_vdup_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_u16 + #define vmls_lane_u16(a, b, v, lane) simde_vmls_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_u16(a, b, v, lane) vmls_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_u16(a, b, v, lane) simde_vmls_u16((a), (b), simde_vdup_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_u16 + #define vmls_laneq_u16(a, b, v, lane) simde_vmls_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_u16(a, b, v, lane) vmlsq_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_u16(a, b, v, lane) simde_vmlsq_u16((a), (b), simde_vdupq_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_u16 + #define vmlsq_laneq_u16(a, b, v, lane) simde_vmlsq_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_u32(a, b, v, lane) vmls_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_u32(a, b, v, lane) simde_vmls_u32((a), (b), simde_vdup_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_u32 + #define vmls_lane_u32(a, b, v, lane) simde_vmls_lane_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_u32(a, b, v, lane) vmls_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_u32(a, b, v, lane) simde_vmls_u32((a), (b), simde_vdup_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_u32 + #define vmls_laneq_u32(a, b, v, lane) simde_vmls_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_u32(a, b, v, lane) vmlsq_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_u32(a, b, v, lane) simde_vmlsq_u32((a), (b), simde_vdupq_laneq_u32((v), 
(lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_u32 + #define vmlsq_laneq_u32(a, b, v, lane) simde_vmlsq_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_f32(a, b, v, lane) vmlsq_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_f32(a, b, v, lane) simde_vmlsq_f32((a), (b), simde_vdupq_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_f32 + #define vmlsq_lane_f32(a, b, v, lane) simde_vmlsq_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_s16(a, b, v, lane) vmlsq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_s16(a, b, v, lane) simde_vmlsq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_s16 + #define vmlsq_lane_s16(a, b, v, lane) simde_vmlsq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_s32(a, b, v, lane) vmlsq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_s32(a, b, v, lane) simde_vmlsq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_s32 + #define vmlsq_lane_s32(a, b, v, lane) simde_vmlsq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_u16(a, b, v, lane) vmlsq_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_u16(a, b, v, lane) simde_vmlsq_u16((a), (b), simde_vdupq_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_u16 + #define vmlsq_lane_u16(a, b, v, lane) simde_vmlsq_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_u32(a, b, v, lane) vmlsq_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_u32(a, b, v, lane) simde_vmlsq_u32((a), (b), simde_vdupq_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_u32 + #define vmlsq_lane_u32(a, b, v, lane) simde_vmlsq_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLS_LANE_H) */ diff --git a/arm/neon/mls_n.h b/arm/neon/mls_n.h index 2ff48e231..9a4239fe4 100644 --- a/arm/neon/mls_n.h +++ b/arm/neon/mls_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLS_N_H) @@ -40,6 +41,13 @@ simde_float32x2_t simde_vmls_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32 c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vmls_f32(a, b, simde_vdup_n_f32(c)); #endif @@ -54,6 +62,13 @@ simde_int16x4_t simde_vmls_n_s16(simde_int16x4_t a, simde_int16x4_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_i16m1(a_.sv64 , c , b_.sv64 , 4); 
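+ // vnmsac.vx computes vd[i] = vd[i] - (rs1 * vs2[i]), so this evaluates a - (b * c) per lane; the trailing 4 is the vector length (four active int16 lanes).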
+ return simde_int16x4_from_private(r_); #else return simde_vmls_s16(a, b, simde_vdup_n_s16(c)); #endif @@ -68,6 +83,13 @@ simde_int32x2_t simde_vmls_n_s32(simde_int32x2_t a, simde_int32x2_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_i32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_int32x2_from_private(r_); #else return simde_vmls_s32(a, b, simde_vdup_n_s32(c)); #endif @@ -82,6 +104,13 @@ simde_uint16x4_t simde_vmls_n_u16(simde_uint16x4_t a, simde_uint16x4_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_u16m1(a_.sv64 , c , b_.sv64 , 4); + return simde_uint16x4_from_private(r_); #else return simde_vmls_u16(a, b, simde_vdup_n_u16(c)); #endif @@ -96,6 +125,13 @@ simde_uint32x2_t simde_vmls_n_u32(simde_uint32x2_t a, simde_uint32x2_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_u32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_uint32x2_from_private(r_); #else return simde_vmls_u32(a, b, simde_vdup_n_u32(c)); #endif @@ -110,6 +146,13 @@ simde_float32x4_t simde_vmlsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32 c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_float32x4_from_private(r_); #else return simde_vmlsq_f32(a, b, simde_vdupq_n_f32(c)); #endif @@ -124,6 +167,13 @@ simde_int16x8_t simde_vmlsq_n_s16(simde_int16x8_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_i16m1(a_.sv128 , c , b_.sv128 , 8); + return simde_int16x8_from_private(r_); #else return simde_vmlsq_s16(a, b, simde_vdupq_n_s16(c)); #endif @@ -138,6 +188,13 @@ simde_int32x4_t simde_vmlsq_n_s32(simde_int32x4_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_i32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlsq_s32(a, b, simde_vdupq_n_s32(c)); #endif @@ -152,6 +209,13 @@ simde_uint16x8_t simde_vmlsq_n_u16(simde_uint16x8_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_u16m1(a_.sv128 , c , b_.sv128 , 8); 
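+ // Full-width q form: both operands are already 128-bit LMUL=1 registers, so a single vnmsac.vx over all eight uint16 lanes suffices.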
+ return simde_uint16x8_from_private(r_); #else return simde_vmlsq_u16(a, b, simde_vdupq_n_u16(c)); #endif @@ -166,6 +230,13 @@ simde_uint32x4_t simde_vmlsq_n_u32(simde_uint32x4_t a, simde_uint32x4_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_u32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlsq_u32(a, b, simde_vdupq_n_u32(c)); #endif diff --git a/arm/neon/mlsl.h b/arm/neon/mlsl.h index e79cea157..6dae3de71 100644 --- a/arm/neon/mlsl.h +++ b/arm/neon/mlsl.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_H) @@ -39,6 +40,15 @@ simde_int16x8_t simde_vmlsl_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + simde_int8x8_private c_ = simde_int8x8_to_private(c); + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64); + vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_i16m1(a_.sv128 , __riscv_vwmul_vv_i16m1(vb , vc , 8) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(a, simde_vmull_s8(b, c)); #endif @@ -53,6 +63,15 @@ simde_int32x4_t simde_vmlsl_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + simde_int16x4_private c_ = simde_int16x4_to_private(c); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + vint16mf2_t vc = __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vv_i32m1(vb , vc , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmull_s16(b, c)); #endif @@ -67,6 +86,15 @@ simde_int64x2_t simde_vmlsl_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + simde_int32x2_private c_ = simde_int32x2_to_private(c); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + vint32mf2_t vc = __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vv_i64m1(vb , vc , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(a, simde_vmull_s32(b, c)); #endif @@ -81,6 +109,15 @@ simde_uint16x8_t simde_vmlsl_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + 
simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + simde_uint8x8_private c_ = simde_uint8x8_to_private(c); + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64); + vuint8mf2_t vc = __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_u16m1(a_.sv128 , __riscv_vwmulu_vv_u16m1(vb , vc , 8) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(a, simde_vmull_u8(b, c)); #endif @@ -95,6 +132,15 @@ simde_uint32x4_t simde_vmlsl_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + simde_uint16x4_private c_ = simde_uint16x4_to_private(c); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + vuint16mf2_t vc = __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vv_u32m1(vb , vc , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmull_u16(b, c)); #endif @@ -109,6 +155,15 @@ simde_uint64x2_t simde_vmlsl_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + simde_uint32x2_private c_ = simde_uint32x2_to_private(c); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + vuint32mf2_t vc = __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vv_u64m1(vb , vc , 2) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(a, simde_vmull_u32(b, c)); #endif diff --git a/arm/neon/mlsl_high.h b/arm/neon/mlsl_high.h index d70ca935d..4477064e5 100644 --- a/arm/neon/mlsl_high.h +++ b/arm/neon/mlsl_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_HIGH_H) @@ -39,6 +40,17 @@ simde_int16x8_t simde_vmlsl_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + simde_int8x16_private c_ = simde_int8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_i8m1(c_.sv128 , 8 , 16); + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv128); + vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_i16m1(a_.sv128 , __riscv_vwmul_vv_i16m1(vb , vc , 8) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(a, simde_vmull_high_s8(b, c)); #endif @@ -53,6 +65,17 @@ simde_int32x4_t simde_vmlsl_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); 
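+ // High-half idiom: vslidedown moves the upper four lanes to the bottom, vlmul_trunc reinterprets them at LMUL=1/2, vwmul widens the product to 32 bits, and vsub subtracts it from the accumulator.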
+ simde_int16x8_private c_ = simde_int16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_i16m1(c_.sv128 , 4 , 8); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128); + vint16mf2_t vc = __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vv_i32m1(vb , vc , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmull_high_s16(b, c)); #endif @@ -67,6 +90,17 @@ simde_int64x2_t simde_vmlsl_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + simde_int32x4_private c_ = simde_int32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_i32m1(c_.sv128 , 2, 4); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128); + vint32mf2_t vc = __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vv_i64m1(vb , vc , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(a, simde_vmull_high_s32(b, c)); #endif @@ -81,6 +115,17 @@ simde_uint16x8_t simde_vmlsl_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + simde_uint8x16_private c_ = simde_uint8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_u8m1(c_.sv128 , 8 , 16); + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128); + vuint8mf2_t vc = __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_u16m1(a_.sv128 , __riscv_vwmulu_vv_u16m1(vb , vc , 8) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(a, simde_vmull_high_u8(b, c)); #endif @@ -95,6 +140,17 @@ simde_uint32x4_t simde_vmlsl_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + simde_uint16x8_private c_ = simde_uint16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_u16m1(c_.sv128 , 4 , 8); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128); + vuint16mf2_t vc = __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vv_u32m1(vb , vc , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmull_high_u16(b, c)); #endif @@ -109,6 +165,17 @@ simde_uint64x2_t simde_vmlsl_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + 
simde_uint32x4_private c_ = simde_uint32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_u32m1(c_.sv128 , 2, 4); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128); + vuint32mf2_t vc = __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vv_u64m1(vb , vc , 2) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(a, simde_vmull_high_u32(b, c)); #endif diff --git a/arm/neon/mlsl_high_lane.h b/arm/neon/mlsl_high_lane.h new file mode 100644 index 000000000..f45b7d989 --- /dev/null +++ b/arm/neon/mlsl_high_lane.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLSL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_MLSL_HIGH_LANE_H + +#include "movl_high.h" +#include "mlsl_high.h" +#include "dup_n.h" +#include "mls.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlsl_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_s16(a, b, simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_s16(a, b, v, lane) vmlsl_high_lane_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_s16 + #define vmlsl_high_lane_s16(a, b, v, lane) simde_vmlsl_high_lane_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlsl_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlsl_high_s16(a, b, simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_s16(a, b, v, lane) vmlsl_high_laneq_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_s16 + #define vmlsl_high_laneq_s16(a, b, v, lane) simde_vmlsl_high_laneq_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlsl_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlsl_high_s32(a, b, simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_s32(a, b, v, lane) vmlsl_high_lane_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_s32 + #define vmlsl_high_lane_s32(a, b, v, lane) simde_vmlsl_high_lane_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlsl_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_s32(a, b, simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_s32(a, b, v, lane) vmlsl_high_laneq_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_s32 + #define vmlsl_high_laneq_s32(a, b, v, lane) simde_vmlsl_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlsl_high_lane_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_u16(a, b, v, lane) vmlsl_high_lane_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_u16 + #define vmlsl_high_lane_u16(a, b, v, lane) simde_vmlsl_high_lane_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlsl_high_laneq_u16(simde_uint32x4_t a, 
simde_uint16x8_t b, simde_uint16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlsl_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_u16(a, b, v, lane) vmlsl_high_laneq_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_u16 + #define vmlsl_high_laneq_u16(a, b, v, lane) simde_vmlsl_high_laneq_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlsl_high_lane_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlsl_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_u32(a, b, v, lane) vmlsl_high_lane_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_u32 + #define vmlsl_high_lane_u32(a, b, v, lane) simde_vmlsl_high_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlsl_high_laneq_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_u32(a, b, v, lane) vmlsl_high_laneq_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_u32 + #define vmlsl_high_laneq_u32(a, b, v, lane) simde_vmlsl_high_laneq_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLSL_HIGH_LANE_H) */ diff --git a/arm/neon/mlsl_high_n.h b/arm/neon/mlsl_high_n.h index 7be34c81b..be23c0079 100644 --- a/arm/neon/mlsl_high_n.h +++ b/arm/neon/mlsl_high_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Décio Luiz Gazzoni Filho + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_HIGH_N_H) @@ -41,6 +42,14 @@ simde_int32x4_t simde_vmlsl_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vx_i32m1(vb , c , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlsq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c)); #endif @@ -55,6 +64,14 @@ simde_int64x2_t simde_vmlsl_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vx_i64m1(vb , c , 2) , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ 
-84,6 +101,14 @@ simde_uint32x4_t simde_vmlsl_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vx_u32m1(vb , c , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlsq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c)); #endif @@ -98,6 +123,14 @@ simde_uint64x2_t simde_vmlsl_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vx_u64m1(vb , c , 2) , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/arm/neon/mlsl_n.h b/arm/neon/mlsl_n.h index 68ee44bff..1ec4a36ce 100644 --- a/arm/neon/mlsl_n.h +++ b/arm/neon/mlsl_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_N_H) @@ -39,6 +40,13 @@ simde_int32x4_t simde_vmlsl_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vx_i32m1(vb , c , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmull_n_s16(b, c)); #endif @@ -53,6 +61,13 @@ simde_int64x2_t simde_vmlsl_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vx_i64m1(vb , c , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(a, simde_vmull_n_s32(b, c)); #endif @@ -67,6 +82,13 @@ simde_uint32x4_t simde_vmlsl_n_u16(simde_uint32x4_t a, simde_uint16x4_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vx_u32m1(vb , c , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmull_n_u16(b, c)); #endif @@ -81,6 +103,13 @@ simde_uint64x2_t 
simde_vmlsl_n_u32(simde_uint64x2_t a, simde_uint32x2_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vx_u64m1(vb , c , 2) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(a, simde_vmull_n_u32(b, c)); #endif diff --git a/arm/neon/mmlaq.h b/arm/neon/mmlaq.h new file mode 100644 index 000000000..b55882ec4 --- /dev/null +++ b/arm/neon/mmlaq.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MMLAQ_H) +#define SIMDE_ARM_NEON_MMLAQ_H + +#include "types.h" +#include "cgt.h" +#include "bsl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmmlaq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { + // I8MM is an optional feature. 
src: https://patchwork.ffmpeg.org/project/ffmpeg/patch/20230530123043.52940-2-martin@martin.st/ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vmmlaq_s32(r, a, b); + #else + simde_int8x16_private + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b); + simde_int32x4_private + r_ = simde_int32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i]; + } + } + return simde_int32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8))) + #undef vmmlaq_s32 + #define vmmlaq_s32(r, a, b) simde_vmmlaq_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmmlaq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vmmlaq_u32(r, a, b); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + simde_uint32x4_private + r_ = simde_uint32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i]; + } + } + return simde_uint32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8))) + #undef vmmlaq_u32 + #define vmmlaq_u32(r, a, b) simde_vmmlaq_u32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusmmlaq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vusmmlaq_s32(r, a, b); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(a); + simde_int8x16_private + b_ = simde_int8x16_to_private(b); + simde_int32x4_private + r_ = simde_int32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i]; + } + } + return simde_int32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8))) + #undef vusmmlaq_s32 + #define vusmmlaq_s32(r, a, b) simde_vusmmlaq_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmmlaq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) && \ + defined(SIMDE_ARM_NEON_BF16) + return vbfmmlaq_f32(r, a, b); + #else + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + simde_float32x4_private + r_ = simde_float32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / 
sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += simde_bfloat16_to_float32(a_.values[(k/2)*4+i]) * + simde_bfloat16_to_float32(b_.values[(k%2)*4+i]); + } + } + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8) && \ + defined(SIMDE_ARM_NEON_BF16))) + #undef vbfmmlaq_f32 + #define vbfmmlaq_f32(r, a, b) simde_vbfmmlaq_f32((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MMLAQ_H) */ diff --git a/arm/neon/movl.h b/arm/neon/movl.h index 853e3249e..91b2db9b9 100644 --- a/arm/neon/movl.h +++ b/arm/neon/movl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MOVL_H) @@ -50,7 +51,10 @@ simde_vmovl_s8(simde_int8x8_t a) { simde_int16x8_private r_; simde_int8x8_private a_ = simde_int8x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t va = __riscv_vlmul_trunc_v_i8m1_i8mf2 (a_.sv64); + r_.sv128 = __riscv_vwcvt_x_x_v_i16m1 (va, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -83,7 +87,10 @@ simde_vmovl_s16(simde_int16x4_t a) { simde_int32x4_private r_; simde_int16x4_private a_ = simde_int16x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16mf2_t va = __riscv_vlmul_trunc_v_i16m1_i16mf2 (a_.sv64); + r_.sv128 = __riscv_vwcvt_x_x_v_i32m1 (va, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -116,7 +123,10 @@ simde_vmovl_s32(simde_int32x2_t a) { simde_int64x2_private r_; simde_int32x2_private a_ = simde_int32x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32mf2_t va = __riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64); + r_.sv128 = __riscv_vwcvt_x_x_v_i64m1 (va, 2); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -149,7 +159,10 @@ simde_vmovl_u8(simde_uint8x8_t a) { simde_uint16x8_private r_; simde_uint8x8_private a_ = simde_uint8x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8mf2_t va = __riscv_vlmul_trunc_v_u8m1_u8mf2(a_.sv64); + r_.sv128 = __riscv_vwcvtu_x_x_v_u16m1 (va, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -182,7 +195,10 @@ simde_vmovl_u16(simde_uint16x4_t a) { simde_uint32x4_private r_; simde_uint16x4_private a_ = simde_uint16x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16mf2_t va = __riscv_vlmul_trunc_v_u16m1_u16mf2(a_.sv64); + r_.sv128 = __riscv_vwcvtu_x_x_v_u32m1 (va, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -215,7 +231,10 @@ simde_vmovl_u32(simde_uint32x2_t a) { simde_uint64x2_private r_; simde_uint32x2_private a_ = simde_uint32x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + 
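Stepping back to the mmlaq.h fallbacks above: the nested loops treat a and b as two rows of eight 8-bit values each and accumulate a 2x2 block of dot products into r, with lane k reading row k/2 of a and row k%2 of b. The standalone sketch below restates that indexing for the signed case; it is an illustration only, and the helper name mmla_s32_ref is invented, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Reference indexing of the vmmlaq_s32 fallback: a and b each hold a 2x8
 * int8 matrix (row-major); r accumulates the 2x2 product of a with the
 * transpose of b, stored row-major as r[0..3]. */
static void mmla_s32_ref(int32_t r[4], const int8_t a[16], const int8_t b[16]) {
  for (int k = 0; k < 4; k++) {
    int32_t acc = r[k];
    for (int i = 0; i < 8; i++) {
      acc += (int32_t) a[(k / 2) * 8 + i] * (int32_t) b[(k % 2) * 8 + i];
    }
    r[k] = acc;
  }
}

int main(void) {
  int8_t a[16], b[16];
  for (int i = 0; i < 16; i++) {
    a[i] = (int8_t) (i + 1);  /* rows: 1..8 and 9..16 */
    b[i] = (int8_t) (i % 4);  /* rows: 0,1,2,3 repeated */
  }
  int32_t r[4] = {0, 0, 0, 0};
  mmla_s32_ref(r, a, b);
  printf("%d %d %d %d\n", (int) r[0], (int) r[1], (int) r[2], (int) r[3]);  /* 64 64 160 160 */
  return 0;
}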
vuint32mf2_t va = __riscv_vlmul_trunc_v_u32m1_u32mf2(a_.sv64); + r_.sv128 = __riscv_vwcvtu_x_x_v_u64m1 (va, 2); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE diff --git a/arm/neon/movn.h b/arm/neon/movn.h index aa3ca453d..cd54f25ef 100644 --- a/arm/neon/movn.h +++ b/arm/neon/movn.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MOVN_H) @@ -42,7 +43,9 @@ simde_vmovn_s16(simde_int16x8_t a) { simde_int8x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_i8mf2_i8m1(__riscv_vncvt_x_x_w_i8mf2(a_.sv128, 8)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -68,7 +71,9 @@ simde_vmovn_s32(simde_int32x4_t a) { simde_int16x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vncvt_x_x_w_i16mf2(a_.sv128, 4)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -94,7 +99,9 @@ simde_vmovn_s64(simde_int64x2_t a) { simde_int32x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vncvt_x_x_w_i32mf2(a_.sv128, 2)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -120,7 +127,9 @@ simde_vmovn_u16(simde_uint16x8_t a) { simde_uint8x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_u8mf2_u8m1(__riscv_vncvt_x_x_w_u8mf2(a_.sv128, 8)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -146,7 +155,9 @@ simde_vmovn_u32(simde_uint32x4_t a) { simde_uint16x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_u16mf2_u16m1(__riscv_vncvt_x_x_w_u16mf2(a_.sv128, 4)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -172,7 +183,9 @@ simde_vmovn_u64(simde_uint64x2_t a) { simde_uint32x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_u32mf2_u32m1(__riscv_vncvt_x_x_w_u32mf2(a_.sv128, 2)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE diff --git a/arm/neon/mul.h b/arm/neon/mul.h index 48de8a240..ce2b87c96 100644 --- a/arm/neon/mul.h +++ b/arm/neon/mul.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MUL_H) @@ -36,6 +38,49 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARM_NEON_FP16) + return vmulh_f16(a, b); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + + return simde_float16_from_float32(a_ * b_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulh_f16 + #define vmulh_f16(a, b) simde_vmulh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmul_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]); + simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]); + r_.values[i] = simde_float16_from_float32(tmp_a_ * tmp_b_); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmul_f16 + #define vmul_f16(a, b) simde_vmul_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -47,7 +92,9 @@ simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vv_f32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -75,7 +122,9 @@ simde_vmul_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vv_f64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -103,7 +152,9 @@ simde_vmul_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -133,6 +184,8 @@ simde_vmul_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _m_pmullw(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else @@ -161,7 +214,9 @@ simde_vmul_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -186,7 +241,9 @@ simde_x_vmul_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), 
b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -209,7 +266,9 @@ simde_vmul_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -237,7 +296,9 @@ simde_vmul_u16(simde_uint16x4_t a, simde_uint16x4_t b) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -265,7 +326,9 @@ simde_vmul_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -290,7 +353,9 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -302,6 +367,32 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) { return simde_uint64x1_from_private(r_); } +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]); + simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]); + r_.values[i] = simde_float16_from_float32(tmp_a_ * tmp_b_); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulq_f16 + #define vmulq_f16(a, b) simde_vmulq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -317,6 +408,8 @@ simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_mul_ps(a_.m128, b_.m128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vv_f32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -349,6 +442,8 @@ simde_vmulq_f64(simde_float64x2_t 
a, simde_float64x2_t b) { r_.m128d = _mm_mul_pd(a_.m128d, b_.m128d); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vv_f64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -400,6 +495,8 @@ simde_vmulq_s8(simde_int8x16_t a, simde_int8x16_t b) { ) #endif ); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -430,6 +527,8 @@ simde_vmulq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_mullo_epi16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -460,6 +559,8 @@ simde_vmulq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -489,6 +590,8 @@ simde_x_vmulq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.v128 = wasm_i64x2_mul(a_.v128, b_.v128); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) r_.m128i = _mm_mullo_epi64(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -506,6 +609,13 @@ simde_uint8x16_t simde_vmulq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + r_.sv128 = __riscv_vmul_vv_u8m1(a_.sv128, b_.sv128, 16); + return simde_uint8x16_from_private(r_); #else return simde_vreinterpretq_u8_s8( @@ -526,6 +636,13 @@ simde_uint16x8_t simde_vmulq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + r_.sv128 = __riscv_vmul_vv_u16m1(a_.sv128, b_.sv128, 8); + return simde_uint16x8_from_private(r_); #else return simde_vreinterpretq_u16_s16( @@ -546,6 +663,13 @@ simde_uint32x4_t simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + r_.sv128 = __riscv_vmul_vv_u32m1(a_.sv128, b_.sv128, 4); + return simde_uint32x4_from_private(r_); #else return simde_vreinterpretq_u32_s32( @@ -564,14 +688,85 @@ simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_x_vmulq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { - return - simde_vreinterpretq_u64_s64( - simde_x_vmulq_s64( - simde_vreinterpretq_s64_u64(a), - simde_vreinterpretq_s64_u64(b) - ) - ); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + r_.sv128 = __riscv_vmul_vv_u64m1(a_.sv128, 
b_.sv128, 2); + return simde_uint64x2_from_private(r_); + #else + return + simde_vreinterpretq_u64_s64( + simde_x_vmulq_s64( + simde_vreinterpretq_s64_u64(a), + simde_vreinterpretq_s64_u64(b) + ) + ); + #endif +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vmul_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmul_p8(a, b); + #else + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(a)), + b_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(b)); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]); + uint16_t result = 0; + for(uint16_t j = 0; j < 8; ++j) { + if (a_.values[i] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (result & (0xFF))); + } + + return simde_vreinterpret_p8_u8(simde_uint8x8_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmul_p8 + #define vmul_p8(a, b) simde_vmul_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vmulq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmulq_p8(a, b); + #else + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(a)), + b_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(b)); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]); + uint16_t result = 0; + for(uint16_t j = 0; j < 8; ++j) { + if (a_.values[i] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (result & (0xFF))); + } + + return simde_vreinterpretq_p8_u8(simde_uint8x16_from_private(r_)); + #endif } +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmulq_p8 + #define vmulq_p8(a, b) simde_vmulq_p8((a), (b)) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/mul_lane.h b/arm/neon/mul_lane.h index f7b1f2e51..540b57524 100644 --- a/arm/neon/mul_lane.h +++ b/arm/neon/mul_lane.h @@ -22,17 +22,39 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su */ #if !defined(SIMDE_ARM_NEON_MUL_LANE_H) #define SIMDE_ARM_NEON_MUL_LANE_H #include "types.h" +#include "mul.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmulh_f16(a, simde_float16x4_to_private(b).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vmulh_lane_f16(a, b, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vmulh_lane_f16(a, b, lane)) + #else + #define simde_vmulh_lane_f16(a, b, lane) vmulh_lane_f16((a), (b), (lane)) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulh_lane_f16 + #define vmulh_lane_f16(a, b, lane) simde_vmulh_lane_f16(a, b, lane) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float64_t 
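The vmul_p8/vmulq_p8 fallbacks above are a carry-less (polynomial, GF(2)) multiply per lane: partial products are combined with XOR rather than addition, and only the low 8 bits of the 16-bit product are kept. The scalar sketch below shows a single lane; it is an illustration only, and poly8_mul is an invented helper name.

#include <stdint.h>
#include <stdio.h>

/* Carry-less multiply of two 8-bit polynomials over GF(2): shift-and-XOR
 * instead of shift-and-add, then keep the low 8 bits, matching the loop in
 * the vmul_p8 fallback. */
static uint8_t poly8_mul(uint8_t a, uint8_t b) {
  uint16_t result = 0;
  for (int j = 0; j < 8; j++) {
    if (a & (1u << j)) {
      result = (uint16_t) (result ^ ((uint16_t) b << j));
    }
  }
  return (uint8_t) (result & 0xFF);
}

int main(void) {
  /* Ordinary multiplication of 0x53 and 0xCA would propagate carries;
   * the polynomial product does not. */
  printf("0x%02x\n", (unsigned) poly8_mul(0x53, 0xCA));  /* 0x7e */
  return 0;
}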
simde_vmuld_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane) @@ -90,6 +112,25 @@ simde_vmuls_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane) #define vmuls_lane_f32(a, b, lane) simde_vmuls_lane_f32(a, b, lane) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmulh_f16(a, simde_float16x8_to_private(b).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vmulh_laneq_f16(a, b, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vmulh_laneq_f16(a, b, lane)) + #else + #define simde_vmulh_laneq_f16(a, b, lane) vmulh_laneq_f16((a), (b), (lane)) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulh_laneq_f16 + #define vmulh_laneq_f16(a, b, lane) simde_vmulh_laneq_f16(a, b, lane) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vmuls_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane) @@ -109,6 +150,30 @@ simde_vmuls_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane) #define vmuls_laneq_f32(a, b, lane) simde_vmuls_laneq_f32(a, b, lane) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmul_lane_f16(a, b, lane) vmul_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmul_lane_f16 + #define vmul_lane_f16(a, b, lane) simde_vmul_lane_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) @@ -118,10 +183,14 @@ simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -142,10 +211,14 @@ simde_vmul_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } @@ -166,10 +239,14 @@ simde_vmul_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane) a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x4_from_private(r_); } @@ -190,10 +267,14 @@ simde_vmul_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane) a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x2_from_private(r_); } @@ -214,10 +295,14 @@ simde_vmul_lane_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int lane) a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x4_from_private(r_); } @@ -238,10 +323,14 @@ simde_vmul_lane_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int lane) a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x2_from_private(r_); } @@ -263,10 +352,14 @@ simde_vmul_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane) simde_int16x8_private b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x4_from_private(r_); } @@ -288,10 +381,14 @@ simde_vmul_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane) simde_int32x4_private b_ = simde_int32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; 
- } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[lane]))); + } + #endif return simde_int32x2_from_private(r_); } @@ -313,10 +410,14 @@ simde_vmul_laneq_u16(simde_uint16x4_t a, simde_uint16x8_t b, const int lane) simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x4_from_private(r_); } @@ -338,10 +439,14 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane) simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x2_from_private(r_); } @@ -353,6 +458,30 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane) #define vmul_laneq_u32(a, b, lane) simde_vmul_laneq_u32((a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulq_lane_f16(a, b, lane) vmulq_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulq_lane_f16 + #define vmulq_lane_f16(a, b, lane) simde_vmulq_lane_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) @@ -362,10 +491,14 @@ simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x4_to_private(a); simde_float32x2_private b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -386,10 +519,14 @@ 
simde_vmulq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x2_to_private(a); simde_float64x1_private b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -410,10 +547,14 @@ simde_vmulq_lane_s16(simde_int16x8_t a, simde_int16x4_t b, const int lane) a_ = simde_int16x8_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x8_from_private(r_); } @@ -434,10 +575,14 @@ simde_vmulq_lane_s32(simde_int32x4_t a, simde_int32x2_t b, const int lane) a_ = simde_int32x4_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x4_from_private(r_); } @@ -458,10 +603,14 @@ simde_vmulq_lane_u16(simde_uint16x8_t a, simde_uint16x4_t b, const int lane) a_ = simde_uint16x8_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x8_from_private(r_); } @@ -482,10 +631,14 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, simde_uint32x2_t b, const int lane) a_ = simde_uint32x4_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x4_from_private(r_); } @@ -497,6 +650,30 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, simde_uint32x2_t b, const int lane) #define vmulq_lane_u32(a, b, lane) simde_vmulq_lane_u32((a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulq_laneq_f16(a, b, lane) vmulq_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulq_laneq_f16 + #define vmulq_laneq_f16(a, b, lane) simde_vmulq_laneq_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) @@ -506,10 +683,14 @@ simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -530,10 +711,14 @@ simde_vmulq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -554,10 +739,14 @@ simde_vmulq_laneq_s16(simde_int16x8_t a, simde_int16x8_t b, const int lane) a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x8_from_private(r_); } @@ -578,10 +767,14 @@ simde_vmulq_laneq_s32(simde_int32x4_t a, simde_int32x4_t b, const int lane) a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[lane]))); + } + #endif return simde_int32x4_from_private(r_); } @@ -602,10 +795,14 @@ 
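Note the cast-through-uint32_t in the s32 laneq fallbacks above: multiplying two int32_t values directly can overflow, and signed overflow is undefined behaviour in C, whereas unsigned multiplication wraps modulo 2^32 and the result is converted back afterwards. The snippet below illustrates the idiom on its own; mul_wrap_s32 is an invented helper, not patch code.

#include <stdint.h>
#include <stdio.h>

/* Multiply two int32_t values with wrap-around semantics instead of relying
 * on signed overflow: the unsigned product is well defined (mod 2^32), and
 * converting it back to int32_t is implementation-defined rather than UB. */
static int32_t mul_wrap_s32(int32_t a, int32_t b) {
  uint32_t p = (uint32_t) a * (uint32_t) b;
  return (int32_t) p;
}

int main(void) {
  /* 65536 * 65536 overflows int32_t; the unsigned route wraps to 0. */
  printf("%d\n", (int) mul_wrap_s32(65536, 65536));  /* 0 */
  return 0;
}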
simde_vmulq_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int lane) a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x8_from_private(r_); } @@ -626,10 +823,14 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane) a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x4_from_private(r_); } @@ -641,6 +842,30 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane) #define vmulq_laneq_u32(a, b, lane) simde_vmulq_laneq_u32((a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + simde_float16x8_private b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmul_laneq_f16(a, b, lane) vmul_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmul_laneq_f16 + #define vmul_laneq_f16(a, b, lane) simde_vmul_laneq_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) @@ -650,10 +875,14 @@ simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x2_to_private(a); simde_float32x4_private b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -674,10 +903,14 @@ simde_vmul_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x1_to_private(a); simde_float64x2_private b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, 
b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } diff --git a/arm/neon/mul_n.h b/arm/neon/mul_n.h index 5c73ad2e7..09b0cd611 100644 --- a/arm/neon/mul_n.h +++ b/arm/neon/mul_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MUL_N_H) @@ -36,6 +37,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_n_f16(simde_float16x4_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmul_n_f16(a, b); + #else + return simde_vmul_f16(a, simde_vdup_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmul_n_f16 + #define vmul_n_f16(a, b) simde_vmul_n_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_n_f32(simde_float32x2_t a, simde_float32 b) { @@ -120,6 +135,20 @@ simde_vmul_n_u32(simde_uint32x2_t a, uint32_t b) { #define vmul_n_u32(a, b) simde_vmul_n_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_n_f16(simde_float16x8_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulq_n_f16(a, b); + #else + return simde_vmulq_f16(a, simde_vdupq_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulq_n_f16 + #define vmulq_n_f16(a, b) simde_vmulq_n_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_n_f32(simde_float32x4_t a, simde_float32 b) { diff --git a/arm/neon/mull.h b/arm/neon/mull.h index bfad62a2f..b88856b17 100644 --- a/arm/neon/mull.h +++ b/arm/neon/mull.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MULL_H) @@ -230,6 +231,62 @@ simde_vmull_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vmull_u32(a, b) simde_vmull_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vmull_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmull_p8(a, b); + #else + simde_uint8x8_private + a_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(a)), + b_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(b)); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]); + uint16_t result = 0; + for(size_t j = 0; j < 8; ++j) { + if (a_.values[i] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = result; + } + + return simde_vreinterpretq_p16_u16(simde_uint16x8_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmull_p8 + #define vmull_p8(a, b) simde_vmull_p8((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vmull_p64(simde_poly64_t a, simde_poly64_t b) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vmull_p64(a, b); + #else + simde_poly128_t extend_op2 = HEDLEY_STATIC_CAST(simde_poly128_t, b); + simde_poly128_t result = 0; + SIMDE_VECTORIZE + for(size_t j = 0; j < 64; ++j) { + if (a & (1ull << j)) { + result = result ^ (extend_op2 << j); + } + } + return result; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_CRYPTO)) + #undef vmull_p64 + #define vmull_p64(a, b) simde_vmull_p64((a), (b)) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/mull_high.h b/arm/neon/mull_high.h index 658d151f7..b5962c221 100644 --- a/arm/neon/mull_high.h +++ b/arm/neon/mull_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MULL_HIGH_H) @@ -30,6 +31,7 @@ #include "types.h" #include "mul.h" #include "movl_high.h" +#include "mull.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -119,6 +121,57 @@ simde_vmull_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vmull_high_u32(a, b) simde_vmull_high_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vmull_high_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_p8(a, b); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(a)), + b_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(b)); + simde_uint16x8_private r_; + + size_t high_offset = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i+high_offset]); + uint16_t result = 0; + for(size_t j = 0; j < 8; ++j) { + if (a_.values[i+high_offset] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = result; + } + + return simde_vreinterpretq_p16_u16(simde_uint16x8_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_p8 + #define vmull_high_p8(a, b) simde_vmull_high_p8((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vmull_high_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vmull_high_p64(a, b); + #else + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + return simde_vmull_p64(a_.values[1], b_.values[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_CRYPTO)) + #undef vmull_high_p64 + #define vmull_high_p64(a, b) simde_vmull_high_p64((a), (b)) +#endif +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/mull_high_lane.h b/arm/neon/mull_high_lane.h new file mode 100644 index 000000000..226dbf862 --- /dev/null +++ b/arm/neon/mull_high_lane.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including 
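vmull_p64 above is the 64x64 -> 128-bit carry-less multiply, and the header only provides it when a 128-bit integer type is available. The sketch below shows the same shift-and-XOR loop using the GCC/Clang unsigned __int128 extension; clmul64 is an invented helper, and the __SIZEOF_INT128__ check is an assumption about how to detect that extension, not something taken from the patch.

#include <stdint.h>
#include <stdio.h>

#if defined(__SIZEOF_INT128__)
/* 64x64 -> 128-bit carry-less multiply: the same shift-and-XOR accumulation
 * as the vmull_p64 fallback, but on the compiler's unsigned __int128 type. */
static unsigned __int128 clmul64(uint64_t a, uint64_t b) {
  unsigned __int128 result = 0;
  unsigned __int128 op2 = b;
  for (int j = 0; j < 64; j++) {
    if (a & (1ULL << j)) {
      result ^= op2 << j;
    }
  }
  return result;
}

int main(void) {
  unsigned __int128 r = clmul64(0x8000000000000001ULL, 0x3ULL);
  /* Print the 128-bit product as two 64-bit halves (high, then low). */
  printf("%016llx %016llx\n",
         (unsigned long long) (r >> 64),
         (unsigned long long) r);  /* 0000000000000001 8000000000000003 */
  return 0;
}
#else
int main(void) {
  puts("unsigned __int128 not available with this compiler");
  return 0;
}
#endif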
without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_MULL_HIGH_LANE_H + +#include "combine.h" +#include "mull.h" +#include "dup_n.h" +#include "get_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmull_high_lane_s16(simde_int16x8_t a, simde_int16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4_private + v_ = simde_int16x4_to_private(v); + return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_s16(a, v, lane) vmull_high_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_s16 + #define vmull_high_lane_s16(a, v, lane) simde_vmull_high_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmull_high_laneq_s16(simde_int16x8_t a, simde_int16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8_private + v_ = simde_int16x8_to_private(v); + return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_s16(a, v, lane) vmull_high_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_s16 + #define vmull_high_laneq_s16(a, v, lane) simde_vmull_high_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmull_high_lane_s32(simde_int32x4_t a, simde_int32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_private + v_ = simde_int32x2_to_private(v); + return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_s32(a, v, lane) vmull_high_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_s32 + #define vmull_high_lane_s32(a, v, lane) simde_vmull_high_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmull_high_laneq_s32(simde_int32x4_t a, simde_int32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private + v_ = simde_int32x4_to_private(v); + return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_s32(a, v, lane) vmull_high_laneq_s32(a, 
v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_s32 + #define vmull_high_laneq_s32(a, v, lane) simde_vmull_high_laneq_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmull_high_lane_u16(simde_uint16x8_t a, simde_uint16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint16x4_private + v_ = simde_uint16x4_to_private(v); + return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_u16(a, v, lane) vmull_high_lane_u16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_u16 + #define vmull_high_lane_u16(a, v, lane) simde_vmull_high_lane_u16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmull_high_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint16x8_private + v_ = simde_uint16x8_to_private(v); + return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_u16(a, v, lane) vmull_high_laneq_u16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_u16 + #define vmull_high_laneq_u16(a, v, lane) simde_vmull_high_laneq_u16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmull_high_lane_u32(simde_uint32x4_t a, simde_uint32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint32x2_private + v_ = simde_uint32x2_to_private(v); + return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_u32(a, v, lane) vmull_high_lane_u32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_u32 + #define vmull_high_lane_u32(a, v, lane) simde_vmull_high_lane_u32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmull_high_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint32x4_private + v_ = simde_uint32x4_to_private(v); + return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_u32(a, v, lane) vmull_high_laneq_u32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_u32 + #define vmull_high_laneq_u32(a, v, lane) simde_vmull_high_laneq_u32((a), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) */ diff --git a/arm/neon/mull_high_n.h b/arm/neon/mull_high_n.h new file mode 100644 index 000000000..d6a5b356f --- /dev/null +++ b/arm/neon/mull_high_n.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULL_HIGH_N_H) +#define SIMDE_ARM_NEON_MULL_HIGH_N_H + +#include "combine.h" +#include "get_high.h" +#include "dup_n.h" +#include "mull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmull_high_n_s16(simde_int16x8_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_s16(a, b); + #else + return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_s16 + #define vmull_high_n_s16(a, b) simde_vmull_high_n_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmull_high_n_s32(simde_int32x4_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_s32(a, b); + #else + return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_s32 + #define vmull_high_n_s32(a, b) simde_vmull_high_n_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmull_high_n_u16(simde_uint16x8_t a, uint16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_u16(a, b); + #else + return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_u16 + #define vmull_high_n_u16(a, b) simde_vmull_high_n_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmull_high_n_u32(simde_uint32x4_t a, uint32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_u32(a, b); + #else + return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_u32 + #define vmull_high_n_u32(a, b) simde_vmull_high_n_u32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULL_HIGH_N_H) */ diff --git a/arm/neon/mulx.h b/arm/neon/mulx.h new file mode 100644 index 000000000..a2a4e8ad0 --- /dev/null +++ b/arm/neon/mulx.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
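For reference, the vmull_high_lane/vmull_high_n helpers introduced above widen the upper half of the first vector and multiply it by a single element of the second argument. A minimal scalar sketch of the s16 case follows; the model_* helper name is illustrative only and is not part of the patch or of SIMDe.

#include <stdint.h>

/* Scalar model of simde_vmull_high_lane_s16: the upper four int16 lanes of a
 * are widened to int32 and each is multiplied by v[lane]. */
static void model_vmull_high_lane_s16(const int16_t a[8], const int16_t v[4],
                                      int lane, int32_t r[4]) {
  for (int i = 0; i < 4; i++) {
    r[i] = (int32_t) a[i + 4] * (int32_t) v[lane];
  }
}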
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULX_H) +#define SIMDE_ARM_NEON_MULX_H + +#include "types.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulxh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulxh_f16(a, b); + #else + return simde_float16_from_float32( + simde_float16_to_float32(a) * + simde_float16_to_float32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxh_f16 + #define vmulxh_f16(a, b) simde_vmulxh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmulxs_f32(simde_float32_t a, simde_float32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxs_f32(a, b); + #else + return a * b; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxs_f32 + #define vmulxs_f32(a, b) simde_vmulxs_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmulxd_f64(simde_float64_t a, simde_float64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxd_f64(a, b); + #else + return a * b; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxd_f64 + #define vmulxd_f64(a, b) simde_vmulxd_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulx_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulx_f16 + #define vmulx_f16(a, b) simde_vmulx_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vmulx_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulx_f32(a, b); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values * b_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + #endif + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_f32 + #define vmulx_f32(a, b) simde_vmulx_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_float64x1_t +simde_vmulx_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulx_f64(a, b); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values * b_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + #endif + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_f64 + #define vmulx_f64(a, b) simde_vmulx_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulxq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxq_f16 + #define vmulxq_f16(a, b) simde_vmulxq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vmulxq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxq_f32(a, b); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_f32 + #define vmulxq_f32(a, b) simde_vmulxq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vmulxq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxq_f64(a, b); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_f64 + #define vmulxq_f64(a, b) simde_vmulxq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULX_H) */ diff --git a/arm/neon/mulx_lane.h b/arm/neon/mulx_lane.h new file mode 100644 index 000000000..4ab5ed8d3 --- /dev/null +++ b/arm/neon/mulx_lane.h @@ -0,0 +1,455 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission 
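As a point of reference for the mulx.h fallbacks above: Arm's FMULX differs from an ordinary multiply only when one operand is a zero and the other an infinity, where the hardware returns 2.0 with the XOR of the operand signs rather than NaN; the portable paths here simply perform a plain multiplication. A hedged scalar sketch of that hardware behaviour, with an illustrative model_* name that is not part of SIMDe:

#include <math.h>

/* Illustrative model of FMULX: 0 * infinity (in either order) yields 2.0,
 * negated when exactly one operand is negative; other inputs multiply normally. */
static double model_fmulx(double a, double b) {
  if ((a == 0.0 && isinf(b)) || (isinf(a) && b == 0.0)) {
    return (signbit(a) != signbit(b)) ? -2.0 : 2.0;
  }
  return a * b;
}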
notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULX_LANE_H) +#define SIMDE_ARM_NEON_MULX_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulxh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_float16_from_float32( + simde_float16_to_float32(a) * + simde_float16_to_float32(simde_float16x4_to_private(b).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxh_lane_f16(a, b, lane) vmulxh_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxh_lane_f16 + #define vmulxh_lane_f16(a, b, lane) simde_vmulxh_lane_f16(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmulxs_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return a * simde_float32x2_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxs_lane_f32(a, b, lane) vmulxs_lane_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxs_lane_f32 + #define vmulxs_lane_f32(a, b, lane) simde_vmulxs_lane_f32(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmulxd_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + return a * simde_float64x1_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxd_lane_f64(a, b, lane) vmulxd_lane_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxd_lane_f64 + #define vmulxd_lane_f64(a, b, lane) simde_vmulxd_lane_f64(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulxh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_float16_from_float32( + simde_float16_to_float32(a) * + simde_float16_to_float32(simde_float16x8_to_private(b).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxh_laneq_f16(a, b, lane) vmulxh_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxh_laneq_f16 + #define vmulxh_laneq_f16(a, b, lane) simde_vmulxh_laneq_f16(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmulxs_laneq_f32(simde_float32_t 
a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return a * simde_float32x4_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxs_laneq_f32(a, b, lane) vmulxs_laneq_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxs_laneq_f32 + #define vmulxs_laneq_f32(a, b, lane) simde_vmulxs_laneq_f32(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmulxd_laneq_f64(simde_float64_t a, simde_float64x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return a * simde_float64x2_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxd_laneq_f64(a, b, lane) vmulxd_laneq_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxd_laneq_f64 + #define vmulxd_laneq_f64(a, b, lane) simde_vmulxd_laneq_f64(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulx_lane_f16(a, b, lane) vmulx_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulx_lane_f16 + #define vmulx_lane_f16(a, b, lane) simde_vmulx_lane_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vmulx_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_lane_f32(a, b, lane) vmulx_lane_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_lane_f32 + #define vmulx_lane_f32(a, b, lane) simde_vmulx_lane_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vmulx_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_lane_f64(a, b, lane) 
vmulx_lane_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_lane_f64 + #define vmulx_lane_f64(a, b, lane) simde_vmulx_lane_f64((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxq_lane_f16(a, b, lane) vmulxq_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxq_lane_f16 + #define vmulxq_lane_f16(a, b, lane) simde_vmulxq_lane_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vmulxq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + simde_float32x2_private b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_lane_f32(a, b, lane) vmulxq_lane_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_lane_f32 + #define vmulxq_lane_f32(a, b, lane) simde_vmulxq_lane_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vmulxq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + simde_float64x1_private b_ = simde_float64x1_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_lane_f64(a, b, lane) vmulxq_lane_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_lane_f64 + #define vmulxq_lane_f64(a, b, lane) simde_vmulxq_lane_f64((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxq_laneq_f16(a, b, lane) vmulxq_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxq_laneq_f16 + #define vmulxq_laneq_f16(a, b, lane) simde_vmulxq_laneq_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vmulxq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_laneq_f32(a, b, lane) vmulxq_laneq_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_laneq_f32 + #define vmulxq_laneq_f32(a, b, lane) simde_vmulxq_laneq_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vmulxq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_laneq_f64(a, b, lane) vmulxq_laneq_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_laneq_f64 + #define vmulxq_laneq_f64(a, b, lane) simde_vmulxq_laneq_f64((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + simde_float16x8_private b_ = simde_float16x8_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulx_laneq_f16(a, b, lane) vmulx_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulx_laneq_f16 + #define vmulx_laneq_f16(a, b, lane) simde_vmulx_laneq_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vmulx_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + 
simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + simde_float32x4_private b_ = simde_float32x4_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_laneq_f32(a, b, lane) vmulx_laneq_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_laneq_f32 + #define vmulx_laneq_f32(a, b, lane) simde_vmulx_laneq_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vmulx_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + simde_float64x2_private b_ = simde_float64x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_laneq_f64(a, b, lane) vmulx_laneq_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_laneq_f64 + #define vmulx_laneq_f64(a, b, lane) simde_vmulx_laneq_f64((a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULX_LANE_H) */ diff --git a/arm/neon/mulx_n.h b/arm/neon/mulx_n.h new file mode 100644 index 000000000..efb10f644 --- /dev/null +++ b/arm/neon/mulx_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
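The *_lane and *_laneq variants above broadcast one element of the second vector across the multiplication; on RISC-V this maps onto the vector-scalar __riscv_vfmul_vf_* form used in the diff, and elsewhere onto a per-element loop. A scalar sketch of the f32 case, ignoring the FMULX zero-times-infinity special case just as the portable path does (the model_* name is illustrative only):

/* Scalar model of simde_vmulxq_lane_f32: every element of a is multiplied by
 * the single element b[lane]. */
static void model_vmulxq_lane_f32(const float a[4], const float b[2],
                                  int lane, float r[4]) {
  for (int i = 0; i < 4; i++) {
    r[i] = a[i] * b[lane];
  }
}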
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULX_N_H) +#define SIMDE_ARM_NEON_MULX_N_H + +#include "types.h" +#include "mul.h" +#include "dup_n.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_n_f16(simde_float16x4_t a, simde_float16 b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulx_n_f16(a, b); + #else + return simde_vmul_f16(a, simde_vdup_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulx_n_f16 + #define vmulx_n_f16(a, b) simde_vmulx_n_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_n_f16(simde_float16x8_t a, simde_float16 b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulxq_n_f16(a, b); + #else + return simde_vmulq_f16(a, simde_vdupq_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vmulxq_n_f16 + #define vmulxq_n_f16(a, b) simde_vmulxq_n_f16((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULX_N_H) */ diff --git a/arm/neon/mvn.h b/arm/neon/mvn.h index 654455ec2..7ded6b502 100644 --- a/arm/neon/mvn.h +++ b/arm/neon/mvn.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MVN_H) @@ -56,6 +57,8 @@ simde_vmvnq_s8(simde_int8x16_t a) { r_.v128 = wasm_v128_not(a_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_i8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -91,6 +94,8 @@ simde_vmvnq_s16(simde_int16x8_t a) { r_.v128 = wasm_v128_not(a_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_i16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -126,6 +131,8 @@ simde_vmvnq_s32(simde_int32x4_t a) { r_.v128 = wasm_v128_not(a_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_i32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -161,6 +168,8 @@ simde_vmvnq_u8(simde_uint8x16_t a) { r_.v128 = wasm_v128_not(a_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_u8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -196,6 +205,8 @@ simde_vmvnq_u16(simde_uint16x8_t a) { r_.v128 = wasm_v128_not(a_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_u16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -231,6 +242,8 @@ simde_vmvnq_u32(simde_uint32x4_t a) { r_.v128 = wasm_v128_not(a_.v128); #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_u32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -260,6 +273,8 @@ simde_vmvn_s8(simde_int8x8_t a) { r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi8(a_.m64, a_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_i8m1(a_.sv64, b_.sv64, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -289,6 +304,8 @@ simde_vmvn_s16(simde_int16x4_t a) { r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi16(a_.m64, a_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_i16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -318,6 +335,8 @@ simde_vmvn_s32(simde_int32x2_t a) { r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi32(a_.m64, a_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_i32m1(a_.sv64, b_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -347,6 +366,8 @@ simde_vmvn_u8(simde_uint8x8_t a) { r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi8(a_.m64, a_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_u8m1(a_.sv64, b_.sv64, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -376,6 +397,8 @@ simde_vmvn_u16(simde_uint16x4_t a) { r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi16(a_.m64, a_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -405,6 +428,8 @@ simde_vmvn_u32(simde_uint32x2_t a) { r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi32(a_.m64, a_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_u32m1(a_.sv64, b_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -420,6 +445,52 @@ simde_vmvn_u32(simde_uint32x2_t a) { #define vmvn_u32(a) simde_vmvn_u32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vmvn_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmvn_p8(a); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ~(a_.values[i]); + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmvn_p8 + #define vmvn_p8(a) simde_vmvn_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vmvnq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmvnq_p8(a); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ~(a_.values[i]); + } + + return 
simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmvnq_p8 + #define vmvnq_p8(a) simde_vmvnq_p8(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/neg.h b/arm/neon/neg.h index 779238950..2c4c4bb04 100644 --- a/arm/neon/neg.h +++ b/arm/neon/neg.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_NEG_H) @@ -47,6 +48,43 @@ simde_vnegd_s64(int64_t a) { #define vnegd_s64(a) simde_vnegd_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vnegh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vnegh_f16(a); + #else + return simde_float16_from_float32(-simde_float16_to_float32(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vnegh_f16 + #define vnegh_f16(a) simde_vnegh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vneg_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vneg_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vnegh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vneg_f16 + #define vneg_f16(a) simde_vneg_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vneg_f32(simde_float32x2_t a) { @@ -209,6 +247,29 @@ simde_vneg_s64(simde_int64x1_t a) { #define vneg_s64(a) simde_vneg_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vnegq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vnegq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vnegh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vnegq_f16 + #define vnegq_f16(a) simde_vnegq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vnegq_f32(simde_float32x4_t a) { diff --git a/arm/neon/padd.h b/arm/neon/padd.h index 6cfd99a2d..11186a1d7 100644 --- a/arm/neon/padd.h +++ b/arm/neon/padd.h @@ -23,6 +23,7 @@ * Copyright: * 2020-2021 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_PADD_H) @@ -96,6 +97,20 @@ simde_vpadds_f32(simde_float32x2_t a) { #define vpadds_f32(a) simde_vpadds_f32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpadd_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) && defined(SIMDE_ARM_NEON_FP16) + return vpadd_f16(a, b); + #else + return simde_vadd_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && 
!(!SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) && defined(SIMDE_ARM_NEON_FP16))) + #undef vpadd_f16 + #define vpadd_f16(a, b) simde_vpadd_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vpadd_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -105,7 +120,7 @@ simde_vpadd_f32(simde_float32x2_t a, simde_float32x2_t b) { return simde_vadd_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0))) #undef vpadd_f32 #define vpadd_f32(a, b) simde_vpadd_f32((a), (b)) #endif @@ -198,6 +213,20 @@ simde_vpadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vpadd_u32(a, b) simde_vpadd_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpaddq_f16(a, b); + #else + return simde_vaddq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpaddq_f16 + #define vpaddq_f16(a, b) simde_vpaddq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vpaddq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/arm/neon/paddl.h b/arm/neon/paddl.h index 203fbad9f..3b36e0dcd 100644 --- a/arm/neon/paddl.h +++ b/arm/neon/paddl.h @@ -286,7 +286,7 @@ simde_vpaddlq_u16(simde_uint16x8_t a) { simde_uint32x4_private r_; #if defined(SIMDE_X86_XOP_NATIVE) - r_.sse_m128i = _mm_haddd_epu16(a_.sse_m128i); + r_.m128i = _mm_haddd_epu16(a_.m128i); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_add_epi32( diff --git a/arm/neon/pmax.h b/arm/neon/pmax.h index ecf31a1a9..249998212 100644 --- a/arm/neon/pmax.h +++ b/arm/neon/pmax.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_PMAX_H) @@ -67,6 +68,20 @@ simde_vpmaxqd_f64(simde_float64x2_t a) { #define vpmaxqd_f64(a) simde_vpmaxqd_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpmax_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmax_f16(a, b); + #else + return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpmax_f16 + #define vpmax_f16(a, b) simde_vpmax_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vpmax_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -165,6 +180,20 @@ simde_vpmax_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vpmax_u32(a, b) simde_vpmax_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmaxq_f16(a, b); + #else + return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpmaxq_f16 + #define vpmaxq_f16(a, b) 
simde_vpmaxq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vpmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -272,7 +301,7 @@ simde_vpmaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { return simde_vmaxq_u32(simde_vuzp1q_u32(a, b), simde_vuzp2q_u32(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vpmaxq_u32 #define vpmaxq_u32(a, b) simde_vpmaxq_u32((a), (b)) #endif diff --git a/arm/neon/pmaxnm.h b/arm/neon/pmaxnm.h new file mode 100644 index 000000000..b85f56e0a --- /dev/null +++ b/arm/neon/pmaxnm.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_PMAXNM_H) +#define SIMDE_ARM_NEON_PMAXNM_H + +#include "types.h" +#include "max.h" +#include "uzp1.h" +#include "uzp2.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vpmaxnms_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnms_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + return (a_.values[0] > a_.values[1]) ? a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnms_f32 + #define vpmaxnms_f32(a) simde_vpmaxnms_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vpmaxnmqd_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnmqd_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + return (a_.values[0] > a_.values[1]) ? 
a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmqd_f64 + #define vpmaxnmqd_f64(a) simde_vpmaxnmqd_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmaxnm_f16(a, b); + #else + return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpmaxnm_f16 + #define vpmaxnm_f16(a, b) simde_vpmaxnm_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vpmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnm_f32(a, b); + #else + return simde_vmax_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnm_f32 + #define vpmaxnm_f32(a, b) simde_vpmaxnm_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmaxnmq_f16(a, b); + #else + return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpmaxnmq_f16 + #define vpmaxnmq_f16(a, b) simde_vpmaxnmq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vpmaxnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnmq_f32(a, b); + #else + return simde_vmaxq_f32(simde_vuzp1q_f32(a, b), simde_vuzp2q_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmq_f32 + #define vpmaxnmq_f32(a, b) simde_vpmaxnmq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vpmaxnmq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnmq_f64(a, b); + #else + return simde_vmaxq_f64(simde_vuzp1q_f64(a, b), simde_vuzp2q_f64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmq_f64 + #define vpmaxnmq_f64(a, b) simde_vpmaxnmq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_PMAXNM_H) */ diff --git a/arm/neon/pmin.h b/arm/neon/pmin.h index eaf58e455..640ca5677 100644 --- a/arm/neon/pmin.h +++ b/arm/neon/pmin.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_PMIN_H) @@ -66,6 +67,20 @@ simde_vpminqd_f64(simde_float64x2_t a) { #define vpminqd_f64(a) simde_vpminqd_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpmin_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmin_f16(a, b); + #else + return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpmin_f16 + #define vpmin_f16(a, b) simde_vpmin_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t 
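The pairwise fallbacks in padd.h, pmax.h, pmin.h, pmaxnm.h and pminnm.h all share one shape: de-interleave the two inputs with uzp1/uzp2, then apply the element-wise operation, so element i of the result is op(x[2i], x[2i+1]) where x is the concatenation of a and b. A small scalar sketch for the two-element f32 pairwise maximum (the model_* name is illustrative only and not part of the patch):

/* Scalar model of simde_vpmax_f32: r[0] pairs the two elements of a,
 * r[1] pairs the two elements of b. */
static void model_vpmax_f32(const float a[2], const float b[2], float r[2]) {
  r[0] = (a[0] > a[1]) ? a[0] : a[1];
  r[1] = (b[0] > b[1]) ? b[0] : b[1];
}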
simde_vpmin_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -164,6 +179,20 @@ simde_vpmin_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vpmin_u32(a, b) simde_vpmin_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpminq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpminq_f16(a, b); + #else + return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpminq_f16 + #define vpminq_f16(a, b) simde_vpminq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vpminq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/arm/neon/pminnm.h b/arm/neon/pminnm.h new file mode 100644 index 000000000..36eae0f15 --- /dev/null +++ b/arm/neon/pminnm.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_PMINNM_H) +#define SIMDE_ARM_NEON_PMINNM_H + +#include "types.h" +#include "min.h" +#include "uzp1.h" +#include "uzp2.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vpminnms_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnms_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + return (a_.values[0] < a_.values[1]) ? a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnms_f32 + #define vpminnms_f32(a) simde_vpminnms_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vpminnmqd_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnmqd_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + return (a_.values[0] < a_.values[1]) ? 
a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmqd_f64 + #define vpminnmqd_f64(a) simde_vpminnmqd_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpminnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpminnm_f16(a, b); + #else + return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpminnm_f16 + #define vpminnm_f16(a, b) simde_vpminnm_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vpminnm_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnm_f32(a, b); + #else + return simde_vmin_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnm_f32 + #define vpminnm_f32(a, b) simde_vpminnm_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpminnmq_f16(a, b); + #else + return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vpminnmq_f16 + #define vpminnmq_f16(a, b) simde_vpminnmq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vpminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnmq_f32(a, b); + #else + return simde_vminq_f32(simde_vuzp1q_f32(a, b), simde_vuzp2q_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmq_f32 + #define vpminnmq_f32(a, b) simde_vpminnmq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vpminnmq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnmq_f64(a, b); + #else + return simde_vminq_f64(simde_vuzp1q_f64(a, b), simde_vuzp2q_f64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmq_f64 + #define vpminnmq_f64(a, b) simde_vpminnmq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_PMINNM_H) */ diff --git a/arm/neon/qabs.h b/arm/neon/qabs.h index 6e956f1e1..9ad7d7c83 100644 --- a/arm/neon/qabs.h +++ b/arm/neon/qabs.h @@ -162,7 +162,7 @@ simde_int8x16_t simde_vqabsq_s8(simde_int8x16_t a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqabsq_s8(a); - #elif defined(SIMDE_X86_SSE4_1_NATIVE) + #elif defined(SIMDE_X86_SSE2_NATIVE) simde_int8x16_private r_, a_ = simde_int8x16_to_private(simde_vabsq_s8(a)); diff --git a/arm/neon/qadd.h b/arm/neon/qadd.h index a577e2399..e5cec8f88 100644 --- a/arm/neon/qadd.h +++ b/arm/neon/qadd.h @@ -144,6 +144,8 @@ simde_vqadd_s8(simde_int8x8_t a, simde_int8x8_t b) { uint8_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsadd_vv_i8m1(a_.sv64, b_.sv64, 8); #else SIMDE_VECTORIZE for 
(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -181,6 +183,8 @@ simde_vqadd_s16(simde_int16x4_t a, simde_int16x4_t b) { uint16_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsadd_vv_i16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -216,6 +220,8 @@ simde_vqadd_s32(simde_int32x2_t a, simde_int32x2_t b) { uint32_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsadd_vv_i32m1(a_.sv64, b_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -251,6 +257,8 @@ simde_vqadd_s64(simde_int64x1_t a, simde_int64x1_t b) { uint64_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsadd_vv_i64m1(a_.sv64, b_.sv64, 1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -282,6 +290,8 @@ simde_vqadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #elif defined(SIMDE_VECTOR_SUBSCRIPT) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsaddu_vv_u8m1(a_.sv64, b_.sv64, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -313,6 +323,8 @@ simde_vqadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #elif defined(SIMDE_VECTOR_SUBSCRIPT) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsaddu_vv_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -342,6 +354,8 @@ simde_vqadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_VECTOR_SUBSCRIPT) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsaddu_vv_u32m1(a_.sv64, b_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -371,6 +385,8 @@ simde_vqadd_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_VECTOR_SUBSCRIPT) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsaddu_vv_u64m1(a_.sv64, b_.sv64, 1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -412,6 +428,8 @@ simde_vqaddq_s8(simde_int8x16_t a, simde_int8x16_t b) { uint8_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsadd_vv_i8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -453,6 +471,8 @@ simde_vqaddq_s16(simde_int16x8_t a, simde_int16x8_t b) { uint16_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsadd_vv_i16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -530,6 +550,8 @@ simde_vqaddq_s32(simde_int32x4_t a, simde_int32x4_t b) { uint32_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsadd_vv_i32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -596,6 +618,8 @@ simde_vqaddq_s64(simde_int64x2_t a, simde_int64x2_t b) { uint64_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0); r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsadd_vv_i64m1(a_.sv128, b_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -631,6 +655,8 @@ simde_vqaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsaddu_vv_u8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -666,6 +692,8 @@ simde_vqaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsaddu_vv_u16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -716,6 +744,8 @@ simde_vqaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsaddu_vv_u32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -745,6 +775,8 @@ simde_vqaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #if defined(SIMDE_VECTOR_SUBSCRIPT) r_.values = a_.values + b_.values; r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsaddu_vv_u64m1(a_.sv128, b_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) 
/ sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/qdmlal.h b/arm/neon/qdmlal.h new file mode 100644 index 000000000..121d7eed8 --- /dev/null +++ b/arm/neon/qdmlal.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_H) +#define SIMDE_ARM_NEON_QDMLAL_H + +#include "types.h" +#include "qadd.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlalh_s16(a, b, c); + #else + return simde_vqadds_s32(a, simde_vqdmullh_s16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlalh_s16 + #define vqdmlalh_s16(a, b, c) simde_vqdmlalh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlals_s32(a, b, c); + #else + return simde_vqaddd_s64(a, simde_vqdmulls_s32(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlals_s32 + #define vqdmlals_s32(a, b, c) simde_vqdmlals_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_s16(a, b, c); + #else + return simde_vqaddq_s32(simde_vqdmull_s16(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_s16 + #define vqdmlal_s16(a, b, c) simde_vqdmlal_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_s32(a, b, c); + #else + return simde_vqaddq_s64(simde_vqdmull_s32(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_s32 + #define vqdmlal_s32(a, b, c) simde_vqdmlal_s32((a), (b), (c)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_H) */ diff --git a/arm/neon/qdmlal_high.h b/arm/neon/qdmlal_high.h new file mode 100644 index 000000000..44edc9d06 --- /dev/null +++ b/arm/neon/qdmlal_high.h @@ -0,0 +1,69 @@ +/* 
SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H) +#define SIMDE_ARM_NEON_QDMLAL_HIGH_H + +#include "types.h" +#include "qadd.h" +#include "qdmull_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_s16(a, b, c); + #else + return simde_vqaddq_s32(simde_vqdmull_high_s16(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_s16 + #define vqdmlal_high_s16(a, b, c) simde_vqdmlal_high_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_s32(a, b, c); + #else + return simde_vqaddq_s64(simde_vqdmull_high_s32(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_s32 + #define vqdmlal_high_s32(a, b, c) simde_vqdmlal_high_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H) */ diff --git a/arm/neon/qdmlal_high_lane.h b/arm/neon/qdmlal_high_lane.h new file mode 100644 index 000000000..279cf5880 --- /dev/null +++ b/arm/neon/qdmlal_high_lane.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H + +#include "dup_lane.h" +#include "get_high.h" +#include "types.h" +#include "qdmlal.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_lane_s16(a, b, v, lane) vqdmlal_high_lane_s16(a, b, v, lane) +#else + #define simde_vqdmlal_high_lane_s16(a, b, v, lane) simde_vqdmlal_s16((a), simde_vget_high_s16((b)), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_lane_s16 + #define vqdmlal_high_lane_s16(a, b, c, lane) simde_vqdmlal_high_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_laneq_s16(a, b, v, lane) vqdmlal_high_laneq_s16(a, b, v, lane) +#else + #define simde_vqdmlal_high_laneq_s16(a, b, v, lane) simde_vqdmlal_s16((a), simde_vget_high_s16((b)), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_laneq_s16 + #define vqdmlal_high_laneq_s16(a, b, v, lane) simde_vqdmlal_high_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_lane_s32(a, b, v, lane) vqdmlal_high_lane_s32(a, b, v, lane) +#else + #define simde_vqdmlal_high_lane_s32(a, b, v, lane) simde_vqdmlal_s32((a), simde_vget_high_s32((b)), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_lane_s32 + #define vqdmlal_high_lane_s32(a, b, v, lane) simde_vqdmlal_high_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_laneq_s32(a, b, v, lane) vqdmlal_high_laneq_s32(a, b, v, lane) +#else + #define simde_vqdmlal_high_laneq_s32(a, b, v, lane) simde_vqdmlal_s32((a), simde_vget_high_s32((b)), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_laneq_s32 + #define vqdmlal_high_laneq_s32(a, b, v, lane) simde_vqdmlal_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H) */ diff --git a/arm/neon/qdmlal_high_n.h b/arm/neon/qdmlal_high_n.h new file mode 100644 index 000000000..4e9c32203 --- /dev/null +++ b/arm/neon/qdmlal_high_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H) +#define SIMDE_ARM_NEON_QDMLAL_HIGH_N_H + +#include "dup_n.h" +#include "types.h" +#include "qdmlal_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_n_s16(a, b, c); + #else + return simde_vqdmlal_high_s16(a, b, simde_vdupq_n_s16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_n_s16 + #define vqdmlal_high_n_s16(a, b, c) simde_vqdmlal_high_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_n_s32(a, b, c); + #else + return simde_vqdmlal_high_s32(a, b, simde_vdupq_n_s32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_n_s32 + #define vqdmlal_high_n_s32(a, b, c) simde_vqdmlal_high_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H) */ diff --git a/arm/neon/qdmlal_lane.h b/arm/neon/qdmlal_lane.h new file mode 100644 index 000000000..14a663cd6 --- /dev/null +++ b/arm/neon/qdmlal_lane.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_LANE_H) +#define SIMDE_ARM_NEON_QDMLAL_LANE_H + +#include "qdmlal.h" +#include "dup_lane.h" +#include "get_lane.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlal_lane_s16(a, b, v, lane) vqdmlal_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_lane_s16(a, b, v, lane) simde_vqdmlal_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_lane_s16 + #define vqdmlal_lane_s16(a, b, c, lane) simde_vqdmlal_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlal_lane_s32(a, b, v, lane) vqdmlal_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_lane_s32(a, b, v, lane) simde_vqdmlal_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_lane_s32 + #define vqdmlal_lane_s32(a, b, c, lane) simde_vqdmlal_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_laneq_s16(a, b, v, lane) vqdmlal_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_laneq_s16(a, b, v, lane) simde_vqdmlal_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_laneq_s16 + #define vqdmlal_laneq_s16(a, b, c, lane) simde_vqdmlal_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_laneq_s32(a, b, v, lane) vqdmlal_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_laneq_s32(a, b, v, lane) simde_vqdmlal_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_laneq_s32 + #define vqdmlal_laneq_s32(a, b, c, lane) simde_vqdmlal_laneq_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlalh_lane_s16(a, b, v, lane) vqdmlalh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlalh_lane_s16(a, b, v, lane) simde_vqdmlalh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlalh_lane_s16 + #define vqdmlalh_lane_s16(a, b, c, lane) simde_vqdmlalh_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlalh_laneq_s16(a, b, v, lane) vqdmlalh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlalh_laneq_s16(a, b, v, lane) simde_vqdmlalh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlalh_laneq_s16 + #define vqdmlalh_laneq_s16(a, b, c, lane) simde_vqdmlalh_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlals_lane_s32(a, b, v, lane) vqdmlals_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlals_lane_s32(a, b, v, lane) simde_vqdmlals_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlals_lane_s32 + #define vqdmlals_lane_s32(a, b, c, lane) simde_vqdmlals_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlals_laneq_s32(a, b, v, lane) vqdmlals_laneq_s32((a), 
(b), (v), (lane)) +#else + #define simde_vqdmlals_laneq_s32(a, b, v, lane) simde_vqdmlals_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlals_laneq_s32 + #define vqdmlals_laneq_s32(a, b, c, lane) simde_vqdmlals_laneq_s32((a), (b), (c), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_LANE_H) */ diff --git a/arm/neon/qdmlal_n.h b/arm/neon/qdmlal_n.h new file mode 100644 index 000000000..0a5c69ea3 --- /dev/null +++ b/arm/neon/qdmlal_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_N_H) +#define SIMDE_ARM_NEON_QDMLAL_N_H + +#include "dup_n.h" +#include "qdmlal.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_n_s16(a, b, c); + #else + return simde_vqdmlal_s16(a, b, simde_vdup_n_s16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_n_s16 + #define vqdmlal_n_s16(a, b, c) simde_vqdmlal_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_n_s32(a, b, c); + #else + return simde_vqdmlal_s32(a, b, simde_vdup_n_s32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_n_s32 + #define vqdmlal_n_s32(a, b, c) simde_vqdmlal_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_N_H) */ diff --git a/arm/neon/qdmlsl.h b/arm/neon/qdmlsl.h new file mode 100644 index 000000000..b476572be --- /dev/null +++ b/arm/neon/qdmlsl.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the 
Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_H) +#define SIMDE_ARM_NEON_QDMLSL_H + +#include "types.h" +#include "qsub.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlslh_s16(a, b, c); + #else + return simde_vqsubs_s32(a, simde_vqdmullh_s16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlslh_s16 + #define vqdmlslh_s16(a, b, c) simde_vqdmlslh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsls_s32(a, b, c); + #else + return simde_vqsubd_s64(a, simde_vqdmulls_s32(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsls_s32 + #define vqdmlsls_s32(a, b, c) simde_vqdmlsls_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_s16(a, b, c); + #else + return simde_vqsubq_s32(a, simde_vqdmull_s16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_s16 + #define vqdmlsl_s16(a, b, c) simde_vqdmlsl_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_s32(a, b, c); + #else + return simde_vqsubq_s64(a, simde_vqdmull_s32(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_s32 + #define vqdmlsl_s32(a, b, c) simde_vqdmlsl_s32((a), (b), (c)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_H) */ diff --git a/arm/neon/qdmlsl_high.h b/arm/neon/qdmlsl_high.h new file mode 100644 index 000000000..01722f290 --- /dev/null +++ b/arm/neon/qdmlsl_high.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_H) +#define SIMDE_ARM_NEON_QDMLSL_HIGH_H + +#include "movl_high.h" +#include "types.h" +#include "qdmull_high.h" +#include "qsub.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_s16(a, b, c); + #else + return simde_vqsubq_s32(a, simde_vqdmull_high_s16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_s16 + #define vqdmlsl_high_s16(a, b, c) simde_vqdmlsl_high_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_s32(a, b, c); + #else + return simde_vqsubq_s64(a, simde_vqdmull_high_s32(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_s32 + #define vqdmlsl_high_s32(a, b, c) simde_vqdmlsl_high_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_H) */ diff --git a/arm/neon/qdmlsl_high_lane.h b/arm/neon/qdmlsl_high_lane.h new file mode 100644 index 000000000..41962746f --- /dev/null +++ b/arm/neon/qdmlsl_high_lane.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H + +#include "dup_lane.h" +#include "get_high.h" +#include "types.h" +#include "qdmlsl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_lane_s16(a, b, v, lane) vqdmlsl_high_lane_s16(a, b, v, lane) +#else + #define simde_vqdmlsl_high_lane_s16(a, b, v, lane) simde_vqdmlsl_s16((a), simde_vget_high_s16((b)), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_lane_s16 + #define vqdmlsl_high_lane_s16(a, b, v, lane) simde_vqdmlsl_high_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_laneq_s16(a, b, v, lane) vqdmlsl_high_laneq_s16(a, b, v, lane) +#else + #define simde_vqdmlsl_high_laneq_s16(a, b, v, lane) simde_vqdmlsl_s16((a), simde_vget_high_s16((b)), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_laneq_s16 + #define vqdmlsl_high_laneq_s16(a, b, v, lane) simde_vqdmlsl_high_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_lane_s32(a, b, v, lane) vqdmlsl_high_lane_s32(a, b, v, lane) +#else + #define simde_vqdmlsl_high_lane_s32(a, b, v, lane) simde_vqdmlsl_s32((a), simde_vget_high_s32((b)), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_lane_s32 + #define vqdmlsl_high_lane_s32(a, b, v, lane) simde_vqdmlsl_high_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_laneq_s32(a, b, v, lane) vqdmlsl_high_laneq_s32(a, b, v, lane) +#else + #define simde_vqdmlsl_high_laneq_s32(a, b, v, lane) simde_vqdmlsl_s32((a), simde_vget_high_s32((b)), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_laneq_s32 + #define vqdmlsl_high_laneq_s32(a, b, v, lane) simde_vqdmlsl_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H) */ diff --git a/arm/neon/qdmlsl_high_n.h b/arm/neon/qdmlsl_high_n.h new file mode 100644 index 000000000..901e9a1b0 --- /dev/null +++ b/arm/neon/qdmlsl_high_n.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_N_H) +#define SIMDE_ARM_NEON_QDMLSL_HIGH_N_H + +#include "movl_high.h" +#include "dup_n.h" +#include "types.h" +#include "qdmlsl_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_n_s16(a, b, c); + #else + return simde_vqdmlsl_high_s16(a, b, simde_vdupq_n_s16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_n_s16 + #define vqdmlsl_high_n_s16(a, b, c) simde_vqdmlsl_high_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_n_s32(a, b, c); + #else + return simde_vqdmlsl_high_s32(a, b, simde_vdupq_n_s32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_n_s32 + #define vqdmlsl_high_n_s32(a, b, c) simde_vqdmlsl_high_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_N_H) */ diff --git a/arm/neon/qdmlsl_lane.h b/arm/neon/qdmlsl_lane.h new file mode 100644 index 000000000..d93677da0 --- /dev/null +++ b/arm/neon/qdmlsl_lane.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_LANE_H) +#define SIMDE_ARM_NEON_QDMLSL_LANE_H + +#include "qdmlsl.h" +#include "dup_lane.h" +#include "get_lane.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlsl_lane_s16(a, b, v, lane) vqdmlsl_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_lane_s16(a, b, v, lane) simde_vqdmlsl_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_lane_s16 + #define vqdmlsl_lane_s16(a, b, c, lane) simde_vqdmlsl_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlsl_lane_s32(a, b, v, lane) vqdmlsl_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_lane_s32(a, b, v, lane) simde_vqdmlsl_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_lane_s32 + #define vqdmlsl_lane_s32(a, b, c, lane) simde_vqdmlsl_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_laneq_s16(a, b, v, lane) vqdmlsl_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_laneq_s16(a, b, v, lane) simde_vqdmlsl_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_laneq_s16 + #define vqdmlsl_laneq_s16(a, b, c, lane) simde_vqdmlsl_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_laneq_s32(a, b, v, lane) vqdmlsl_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_laneq_s32(a, b, v, lane) simde_vqdmlsl_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_laneq_s32 + #define vqdmlsl_laneq_s32(a, b, c, lane) simde_vqdmlsl_laneq_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlslh_lane_s16(a, b, v, lane) vqdmlslh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlslh_lane_s16(a, b, v, lane) simde_vqdmlslh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlslh_lane_s16 + #define vqdmlslh_lane_s16(a, b, c, lane) simde_vqdmlslh_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlslh_laneq_s16(a, b, v, lane) vqdmlslh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlslh_laneq_s16(a, b, v, lane) simde_vqdmlslh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlslh_laneq_s16 + #define vqdmlslh_laneq_s16(a, b, c, lane) simde_vqdmlslh_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsls_lane_s32(a, b, v, lane) vqdmlsls_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsls_lane_s32(a, b, v, lane) simde_vqdmlsls_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsls_lane_s32 + #define vqdmlsls_lane_s32(a, b, c, lane) simde_vqdmlsls_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsls_laneq_s32(a, b, v, lane) vqdmlsls_laneq_s32((a), 
(b), (v), (lane)) +#else + #define simde_vqdmlsls_laneq_s32(a, b, v, lane) simde_vqdmlsls_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsls_laneq_s32 + #define vqdmlsls_laneq_s32(a, b, c, lane) simde_vqdmlsls_laneq_s32((a), (b), (c), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_LANE_H) */ diff --git a/arm/neon/qdmlsl_n.h b/arm/neon/qdmlsl_n.h new file mode 100644 index 000000000..5707f4c47 --- /dev/null +++ b/arm/neon/qdmlsl_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_N_H) +#define SIMDE_ARM_NEON_QDMLSL_N_H + +#include "dup_n.h" +#include "qdmlsl.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_n_s16(a, b, c); + #else + return simde_vqdmlsl_s16(a, b, simde_vdup_n_s16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_n_s16 + #define vqdmlsl_n_s16(a, b, c) simde_vqdmlsl_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_n_s32(a, b, c); + #else + return simde_vqdmlsl_s32(a, b, simde_vdup_n_s32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_n_s32 + #define vqdmlsl_n_s32(a, b, c) simde_vqdmlsl_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_N_H) */ diff --git a/arm/neon/qdmulh.h b/arm/neon/qdmulh.h index d42e393ad..40d187cd9 100644 --- a/arm/neon/qdmulh.h +++ b/arm/neon/qdmulh.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QDMULH_H) @@ -52,7 +53,7 @@ simde_vqdmulhs_s32(int32_t a, int32_t b) { } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqdmulhs_s32 - #define vqdmulhs_s32(a) simde_vqdmulhs_s32((a)) + #define vqdmulhs_s32(a, b) simde_vqdmulhs_s32((a),
(b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -63,7 +64,7 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) { #else simde_int16x4_private r_; - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH)) simde_int16x8_private tmp_ = simde_int16x8_to_private( simde_vreinterpretq_s16_s32( @@ -89,6 +90,21 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) { #define vqdmulh_s16(a, b) simde_vqdmulh_s16((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqdmulhh_s16(int16_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmulhh_s16(a, b); + #else + int32_t tmp = simde_vqdmullh_s16(a, b); + return HEDLEY_STATIC_CAST(int16_t, tmp >> 16); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulhh_s16 + #define vqdmulhh_s16(a, b) simde_vqdmulhh_s16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_int32x2_t simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) { @@ -97,7 +113,7 @@ simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) { #else simde_int32x2_private r_; - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH)) simde_int32x4_private tmp_ = simde_int32x4_to_private( simde_vreinterpretq_s32_s64( diff --git a/arm/neon/qdmulh_lane.h b/arm/neon/qdmulh_lane.h index 3120eb7ad..32cd22dea 100644 --- a/arm/neon/qdmulh_lane.h +++ b/arm/neon/qdmulh_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QDMULH_LANE_H) @@ -37,6 +38,17 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulhh_lane_s16(a, v, lane) vqdmulhh_lane_s16((a), (v), (lane)) +#else + #define simde_vqdmulhh_lane_s16(a, v, lane) \ + simde_vqdmulhh_s16((a), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulhh_lane_s16 + #define vqdmulhh_lane_s16(a, v, lane) simde_vqdmulhh_lane_s16((a), (v), (lane)) +#endif + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_vqdmulh_lane_s16(a, v, lane) vqdmulh_lane_s16((a), (v), (lane)) #else @@ -81,6 +93,17 @@ SIMDE_BEGIN_DECLS_ #define vqdmulhq_lane_s32(a, v, lane) simde_vqdmulhq_lane_s32((a), (v), (lane)) #endif +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulhh_laneq_s16(a, v, lane) vqdmulhh_laneq_s16((a), (v), (lane)) +#else + #define simde_vqdmulhh_laneq_s16(a, v, lane) \ + simde_vqdmulhh_s16((a), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulhh_laneq_s16 + #define vqdmulhh_laneq_s16(a, v, lane) simde_vqdmulhh_laneq_s16((a), (v), (lane)) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqdmulh_laneq_s16(a, v, lane) vqdmulh_laneq_s16((a), (v), (lane)) #else diff --git a/arm/neon/qdmull.h b/arm/neon/qdmull.h index 88bf50bcb..b374cd4f8 100644 --- a/arm/neon/qdmull.h +++ b/arm/neon/qdmull.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ /* Implementation notes (seanptmaher): @@ -63,12 +65,12 @@ simde_vqdmulls_s32(int32_t a, int32_t b) { return 
vqdmulls_s32(a, b); #else int64_t mul = (HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b)); - return ((a > 0 ? a : -a) & (HEDLEY_STATIC_CAST(int64_t, 1) << 62)) ? ((mul < 0) ? INT64_MIN : INT64_MAX) : mul << 1; + return (simde_math_llabs(mul) & (HEDLEY_STATIC_CAST(int64_t, 1) << 62)) ? ((mul < 0) ? INT64_MIN : INT64_MAX) : mul << 1; #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vqdmulls_s16 - #define vqdmulls_s16(a, b) simde_vqdmulls_s16((a), (b)) + #undef vqdmulls_s32 + #define vqdmulls_s32(a, b) simde_vqdmulls_s32((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -96,11 +98,17 @@ simde_vqdmull_s16(simde_int16x4_t a, simde_int16x4_t b) { simde_int16x4_private a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t mul = __riscv_vwmul_vv_i32m2(a_.sv64, b_.sv64, 4); + r_.sv128 = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vmerge_vxm_i32m2(__riscv_vmerge_vxm_i32m2( + __riscv_vsll_vx_i32m2(mul, 1, 4), INT32_MAX, __riscv_vmsgt_vx_i32m2_b16(mul, INT32_C(0x3FFFFFFF), 4), 4), + INT32_MIN, __riscv_vmslt_vx_i32m2_b16(mul, -INT32_C(0x40000000), 4), 4)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[i]); + } + #endif return simde_int32x4_from_private(r_); #endif @@ -136,10 +144,17 @@ simde_vqdmull_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t mul = __riscv_vwmul_vv_i64m2(a_.sv64, b_.sv64, 2); + r_.sv128 = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vmerge_vxm_i64m2(__riscv_vmerge_vxm_i64m2( + __riscv_vsll_vx_i64m2(mul, 1, 2), INT64_MAX, __riscv_vmsgt_vx_i64m2_b32(mul, INT64_C(0x3FFFFFFFFFFFFFFF), 2), 2), + INT64_MIN, __riscv_vmslt_vx_i64m2_b32(mul, -INT64_C(0x4000000000000000), 2), 2)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[i]); + } + #endif return simde_int64x2_from_private(r_); #endif diff --git a/arm/neon/qdmull_high.h b/arm/neon/qdmull_high.h new file mode 100644 index 000000000..2c6b26912 --- /dev/null +++ b/arm/neon/qdmull_high.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_H) +#define SIMDE_ARM_NEON_QDMULL_HIGH_H + +#include "combine.h" +#include "get_high.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_s16(a, b); + #else + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vget_high_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_s16 + #define vqdmull_high_s16(a, b) simde_vqdmull_high_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_s32(a, b); + #else + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vget_high_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_s32 + #define vqdmull_high_s32(a, b) simde_vqdmull_high_s32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_H) */ diff --git a/arm/neon/qdmull_high_lane.h b/arm/neon/qdmull_high_lane.h new file mode 100644 index 000000000..f8326b2bf --- /dev/null +++ b/arm/neon/qdmull_high_lane.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H + +#include "combine.h" +#include "qdmull.h" +#include "dup_n.h" +#include "get_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_lane_s16(simde_int16x8_t a, simde_int16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4_private + v_ = simde_int16x4_to_private(v); + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_lane_s16(a, v, lane) vqdmull_high_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_lane_s16 + #define vqdmull_high_lane_s16(a, v, lane) simde_vqdmull_high_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_laneq_s16(simde_int16x8_t a, simde_int16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8_private + v_ = simde_int16x8_to_private(v); + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_laneq_s16(a, v, lane) vqdmull_high_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_laneq_s16 + #define vqdmull_high_laneq_s16(a, v, lane) simde_vqdmull_high_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_lane_s32(simde_int32x4_t a, simde_int32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_private + v_ = simde_int32x2_to_private(v); + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_lane_s32(a, v, lane) vqdmull_high_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_lane_s32 + #define vqdmull_high_lane_s32(a, v, lane) simde_vqdmull_high_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_laneq_s32(simde_int32x4_t a, simde_int32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private + v_ = simde_int32x4_to_private(v); + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_laneq_s32(a, v, lane) vqdmull_high_laneq_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_laneq_s32 + #define vqdmull_high_laneq_s32(a, v, lane) simde_vqdmull_high_laneq_s32((a), (v), (lane)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) */ diff --git a/arm/neon/qdmull_high_n.h b/arm/neon/qdmull_high_n.h new file mode 100644 index 000000000..aef31240f --- /dev/null +++ b/arm/neon/qdmull_high_n.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, 
sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_N_H) +#define SIMDE_ARM_NEON_QDMULL_HIGH_N_H + +#include "combine.h" +#include "get_high.h" +#include "dup_n.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_n_s16(simde_int16x8_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_n_s16(a, b); + #else + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_n_s16 + #define vqdmull_high_n_s16(a, b) simde_vqdmull_high_n_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_n_s32(simde_int32x4_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_n_s32(a, b); + #else + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_n_s32 + #define vqdmull_high_n_s32(a, b) simde_vqdmull_high_n_s32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_N_H) */ diff --git a/arm/neon/qdmull_lane.h b/arm/neon/qdmull_lane.h new file mode 100644 index 000000000..a7bf68cbd --- /dev/null +++ b/arm/neon/qdmull_lane.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_LANE_H) +#define SIMDE_ARM_NEON_QDMULL_LANE_H + +#include "combine.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmullh_lane_s16(int16_t a, simde_int16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4_private + v_ = simde_int16x4_to_private(v); + + return simde_vqdmullh_s16(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmullh_lane_s16(a, v, lane) vqdmullh_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmullh_lane_s16 + #define vqdmullh_lane_s16(a, v, lane) simde_vqdmullh_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmullh_laneq_s16(int16_t a, simde_int16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8_private + v_ = simde_int16x8_to_private(v); + + return simde_vqdmullh_s16(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmullh_laneq_s16(a, v, lane) vqdmullh_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmullh_laneq_s16 + #define vqdmullh_laneq_s16(a, v, lane) simde_vqdmullh_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmulls_lane_s32(int32_t a, simde_int32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_private + v_ = simde_int32x2_to_private(v); + + return simde_vqdmulls_s32(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulls_lane_s32(a, v, lane) vqdmulls_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulls_lane_s32 + #define vqdmulls_lane_s32(a, v, lane) simde_vqdmulls_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmulls_laneq_s32(int32_t a, simde_int32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private + v_ = simde_int32x4_to_private(v); + + return simde_vqdmulls_s32(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulls_laneq_s32(a, v, lane) vqdmulls_laneq_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulls_laneq_s32 + #define vqdmulls_laneq_s32(a, v, lane) simde_vqdmulls_laneq_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private r_; + simde_int16x4_private + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[lane]); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmull_lane_s16(a, v, lane) vqdmull_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmull_lane_s16 + #define vqdmull_lane_s16(a, v, lane) simde_vqdmull_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + 
simde_int32x4_private r_; + simde_int16x4_private + a_ = simde_int16x4_to_private(a); + simde_int16x8_private + b_ = simde_int16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[lane]); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_laneq_s16(a, v, lane) vqdmull_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_laneq_s16 + #define vqdmull_laneq_s16(a, v, lane) simde_vqdmull_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2_private r_; + simde_int32x2_private + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[lane]); + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmull_lane_s32(a, v, lane) vqdmull_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmull_lane_s32 + #define vqdmull_lane_s32(a, v, lane) simde_vqdmull_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int64x2_private r_; + simde_int32x2_private + a_ = simde_int32x2_to_private(a); + simde_int32x4_private + b_ = simde_int32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[lane]); + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_laneq_s32(a, v, lane) vqdmull_laneq_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_laneq_s32 + #define vqdmull_laneq_s32(a, v, lane) simde_vqdmull_laneq_s32((a), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_H) */ diff --git a/arm/neon/qdmull_n.h b/arm/neon/qdmull_n.h new file mode 100644 index 000000000..691802637 --- /dev/null +++ b/arm/neon/qdmull_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_N_H) +#define SIMDE_ARM_NEON_QDMULL_N_H + +#include "combine.h" +#include "dup_n.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_n_s16(simde_int16x4_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmull_n_s16(a, b); + #else + return simde_vqdmull_s16(a, simde_vdup_n_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmull_n_s16 + #define vqdmull_n_s16(a, b) simde_vqdmull_n_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_n_s32(simde_int32x2_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmull_n_s32(a, b); + #else + return simde_vqdmull_s32(a, simde_vdup_n_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmull_n_s32 + #define vqdmull_n_s32(a, b) simde_vqdmull_n_s32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_N_H) */ diff --git a/arm/neon/qmovun_high.h b/arm/neon/qmovun_high.h new file mode 100644 index 000000000..edb3e17a4 --- /dev/null +++ b/arm/neon/qmovun_high.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QMOVUN_HIGH_H) +#define SIMDE_ARM_NEON_QMOVUN_HIGH_H + +#include "types.h" + +#include "combine.h" +#include "qmovun.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqmovun_high_s16(simde_uint8x8_t r, simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqmovun_high_s16(r, a); + #else + return simde_vcombine_u8(r, simde_vqmovun_s16(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqmovun_high_s16 + #define vqmovun_high_s16(r, a) simde_vqmovun_high_s16((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqmovun_high_s32(simde_uint16x4_t r, simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqmovun_high_s32(r, a); + #else + return simde_vcombine_u16(r, simde_vqmovun_s32(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqmovun_high_s32 + #define vqmovun_high_s32(r, a) simde_vqmovun_high_s32((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqmovun_high_s64(simde_uint32x2_t r, simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqmovun_high_s64(r, a); + #else + return simde_vcombine_u32(r, simde_vqmovun_s64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqmovun_high_s64 + #define vqmovun_high_s64(r, a) simde_vqmovun_high_s64((r), (a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QMOVUN_HIGH_H) */ diff --git a/arm/neon/qrdmlah.h b/arm/neon/qrdmlah.h new file mode 100644 index 000000000..b74f40db0 --- /dev/null +++ b/arm/neon/qrdmlah.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLAH_H) +#define SIMDE_ARM_NEON_QRDMLAH_H + +#include "types.h" +#include "qmovn.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + return SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vqrdmlahh_s16(a, b, c)); + #else + return vqrdmlahh_s16(a, b, c); + #endif + #else + int64_t r = (((1 << 15) + (HEDLEY_STATIC_CAST(int64_t, a) << 16) + ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)))) << 1)) >> 16); + return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahh_s16 + #define vqrdmlahh_s16(a, b, c) simde_vqrdmlahh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlahs_s32(a, b, c); + #else + int64_t round_const = (HEDLEY_STATIC_CAST(int64_t, 1) << 31); + int64_t a_ = (HEDLEY_STATIC_CAST(int64_t, a) << 32); + int64_t sum = round_const + a_; + int64_t mul = (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)); + int64_t mul2 = mul << 1; + if (mul2 >> 1 != mul) { + if (mul > 0) return INT32_MAX; + else if (mul < 0) return INT32_MIN; + } + int64_t sum2 = sum + mul2; + if (sum > 0 && INT64_MAX - sum < mul2) return INT32_MAX; + if (sum < 0 && INT64_MIN - sum > mul2) return INT32_MIN; + return HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahs_s32 + #define vqrdmlahs_s32(a, b, c) simde_vqrdmlahs_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqrdmlah_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlah_s16(a, b, c); + #else + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlah_s16 + #define vqrdmlah_s16(a, b, c) simde_vqrdmlah_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqrdmlah_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlah_s32(a, b, c); + #else + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + 
r_.values[i] = simde_vqrdmlahs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlah_s32 + #define vqrdmlah_s32(a, b, c) simde_vqrdmlah_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrdmlahq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlahq_s16(a, b, c); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahq_s16 + #define vqrdmlahq_s16(a, b, c) simde_vqrdmlahq_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrdmlahq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlahq_s32(a, b, c); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahq_s32 + #define vqrdmlahq_s32(a, b, c) simde_vqrdmlahq_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLAH_H) */ diff --git a/arm/neon/qrdmlah_lane.h b/arm/neon/qrdmlah_lane.h new file mode 100644 index 000000000..61ed84934 --- /dev/null +++ b/arm/neon/qrdmlah_lane.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLAH_LANE_H) +#define SIMDE_ARM_NEON_QRDMLAH_LANE_H + +#include "types.h" +#include "qrdmlah.h" +#include "dup_lane.h" +#include "get_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahh_lane_s16(a, b, v, lane) vqrdmlahh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahh_lane_s16 + #define vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) vqrdmlahh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahh_laneq_s16 + #define vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahs_lane_s32(a, b, v, lane) vqrdmlahs_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahs_lane_s32 + #define vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) vqrdmlahs_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahs_laneq_s32 + #define vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_lane_s16(a, b, v, lane) vqrdmlah_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlah_lane_s16 + #define vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_lane_s32(a, b, v, lane) vqrdmlah_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && 
!(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlah_lane_s32 + #define vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_lane_s16(a, b, v, lane) vqrdmlahq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahq_lane_s16 + #define vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_lane_s32(a, b, v, lane) vqrdmlahq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahq_lane_s32 + #define vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_laneq_s16(a, b, v, lane) vqrdmlah_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlah_laneq_s16 + #define vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_laneq_s32(a, b, v, lane) vqrdmlah_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlah_laneq_s32 + #define vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) vqrdmlahq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahq_laneq_s16 + #define vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) vqrdmlahq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlahq_laneq_s32 + #define vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_laneq_s32((a), (b), (v), (lane)) +#endif + 
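+/* Illustrative note (editorial sketch, not part of the upstream header):
+ * each lane variant above simply selects or broadcasts lane `lane` of `v`
+ * and forwards to the matching element-wise helper from qrdmlah.h, which
+ * evaluates the saturating rounding doubling multiply-accumulate
+ *
+ *   r = sat_N(a + ((2 * b * v[lane] + (1 << (N - 1))) >> N)),  N = 16 or 32.
+ *
+ * A minimal usage sketch, assuming the portable fallback path (no QRDMX):
+ *
+ *   simde_int16x4_t v = simde_vdup_n_s16(3000);
+ *   int16_t r = simde_vqrdmlahh_lane_s16(100, 2000, v, 0);
+ *   // r == 283: 100 + ((2 * 2000 * 3000 + (1 << 15)) >> 16)
+ */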
+SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLAH_LANE_H) */ diff --git a/arm/neon/qrdmlsh.h b/arm/neon/qrdmlsh.h new file mode 100644 index 000000000..71cd1d015 --- /dev/null +++ b/arm/neon/qrdmlsh.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLSH_H) +#define SIMDE_ARM_NEON_QRDMLSH_H + +#include "types.h" +#include "qmovn.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + return SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vqrdmlshh_s16(a, b, c)); + #else + return vqrdmlshh_s16(a, b, c); + #endif + #else + int64_t r = (((1 << 15) + (HEDLEY_STATIC_CAST(int64_t, a) << 16) - ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)))) << 1)) >> 16); + return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshh_s16 + #define vqrdmlshh_s16(a, b, c) simde_vqrdmlshh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlshs_s32(a, b, c); + #else + int64_t round_const = (HEDLEY_STATIC_CAST(int64_t, 1) << 31); + int64_t a_ = (HEDLEY_STATIC_CAST(int64_t, a) << 32); + int64_t sum = round_const + a_; + int64_t mul = -(HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)); + int64_t mul2 = mul << 1; + if (mul2 >> 1 != mul) { + if (mul > 0) return INT32_MAX; + else if (mul < 0) return INT32_MIN; + } + int64_t sum2 = sum + mul2; + if (sum > 0 && INT64_MAX - sum < mul2) return INT32_MAX; + if (sum < 0 && INT64_MIN - sum > mul2) return INT32_MIN; + return HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshs_s32 + #define 
vqrdmlshs_s32(a, b, c) simde_vqrdmlshs_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqrdmlsh_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlsh_s16(a, b, c); + #else + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlsh_s16 + #define vqrdmlsh_s16(a, b, c) simde_vqrdmlsh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqrdmlsh_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlsh_s32(a, b, c); + #else + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlsh_s32 + #define vqrdmlsh_s32(a, b, c) simde_vqrdmlsh_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrdmlshq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlshq_s16(a, b, c); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshq_s16 + #define vqrdmlshq_s16(a, b, c) simde_vqrdmlshq_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrdmlshq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlshq_s32(a, b, c); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshq_s32 + #define vqrdmlshq_s32(a, b, c) simde_vqrdmlshq_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* 
!defined(SIMDE_ARM_NEON_QRDMLSH_H) */ diff --git a/arm/neon/qrdmlsh_lane.h b/arm/neon/qrdmlsh_lane.h new file mode 100644 index 000000000..b40a39a9d --- /dev/null +++ b/arm/neon/qrdmlsh_lane.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLSH_LANE_H) +#define SIMDE_ARM_NEON_QRDMLSH_LANE_H + +#include "types.h" +#include "qrdmlsh.h" +#include "dup_lane.h" +#include "get_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshh_lane_s16(a, b, v, lane) vqrdmlshh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshh_lane_s16(a, b, v, lane) simde_vqrdmlshh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshh_lane_s16 + #define vqrdmlshh_lane_s16(a, b, v, lane) simde_vqrdmlshh_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshh_laneq_s16(a, b, v, lane) vqrdmlshh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshh_laneq_s16(a, b, v, lane) simde_vqrdmlshh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshh_laneq_s16 + #define vqrdmlshh_laneq_s16(a, b, v, lane) simde_vqrdmlshh_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshs_lane_s32(a, b, v, lane) vqrdmlshs_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshs_lane_s32(a, b, v, lane) simde_vqrdmlshs_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshs_lane_s32 + #define vqrdmlshs_lane_s32(a, b, v, lane) simde_vqrdmlshs_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshs_laneq_s32(a, b, v, lane) vqrdmlshs_laneq_s32((a), (b), (v), (lane)) +#else 
+ #define simde_vqrdmlshs_laneq_s32(a, b, v, lane) simde_vqrdmlshs_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshs_laneq_s32 + #define vqrdmlshs_laneq_s32(a, b, v, lane) simde_vqrdmlshs_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_lane_s16(a, b, v, lane) vqrdmlsh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_lane_s16(a, b, v, lane) simde_vqrdmlsh_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlsh_lane_s16 + #define vqrdmlsh_lane_s16(a, b, v, lane) simde_vqrdmlsh_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_lane_s32(a, b, v, lane) vqrdmlsh_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_lane_s32(a, b, v, lane) simde_vqrdmlsh_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlsh_lane_s32 + #define vqrdmlsh_lane_s32(a, b, v, lane) simde_vqrdmlsh_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_lane_s16(a, b, v, lane) vqrdmlshq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_lane_s16(a, b, v, lane) simde_vqrdmlshq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshq_lane_s16 + #define vqrdmlshq_lane_s16(a, b, v, lane) simde_vqrdmlshq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_lane_s32(a, b, v, lane) vqrdmlshq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_lane_s32(a, b, v, lane) simde_vqrdmlshq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshq_lane_s32 + #define vqrdmlshq_lane_s32(a, b, v, lane) simde_vqrdmlshq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_laneq_s16(a, b, v, lane) vqrdmlsh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_laneq_s16(a, b, v, lane) simde_vqrdmlsh_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlsh_laneq_s16 + #define vqrdmlsh_laneq_s16(a, b, v, lane) simde_vqrdmlsh_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_laneq_s32(a, b, v, lane) vqrdmlsh_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_laneq_s32(a, b, v, lane) simde_vqrdmlsh_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlsh_laneq_s32 + #define vqrdmlsh_laneq_s32(a, b, v, lane) simde_vqrdmlsh_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_laneq_s16(a, b, v, lane) vqrdmlshq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_laneq_s16(a, b, v, lane) simde_vqrdmlshq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshq_laneq_s16 + #define vqrdmlshq_laneq_s16(a, b, v, lane) simde_vqrdmlshq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_laneq_s32(a, b, v, lane) vqrdmlshq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_laneq_s32(a, b, v, lane) simde_vqrdmlshq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) + #undef vqrdmlshq_laneq_s32 + #define vqrdmlshq_laneq_s32(a, b, v, lane) simde_vqrdmlshq_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLSH_LANE_H) */ diff --git a/arm/neon/qrdmulh.h b/arm/neon/qrdmulh.h index 9a69b92e5..12e16a146 100644 --- a/arm/neon/qrdmulh.h +++ b/arm/neon/qrdmulh.h @@ -40,7 +40,10 @@ simde_vqrdmulhh_s16(int16_t a, int16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqrdmulhh_s16(a, b); #else - return HEDLEY_STATIC_CAST(int16_t, (((1 << 15) + ((HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(int32_t, a) * HEDLEY_STATIC_CAST(int32_t, b)))) << 1)) >> 16) & 0xffff); + int32_t temp = HEDLEY_STATIC_CAST(int32_t, a) * HEDLEY_STATIC_CAST(int32_t, b); + int32_t r = temp > 0 ? (temp > (INT32_MAX >> 1) ? INT32_MAX : (temp << 1)) : (temp < (INT32_MIN >> 1) ? INT32_MIN : (temp << 1)); + r = (r > (INT32_MAX - (1 << 15))) ? INT32_MAX : ((1 << 15) + r); + return HEDLEY_STATIC_CAST(int16_t, ((r >> 16) & 0xffff)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -54,7 +57,10 @@ simde_vqrdmulhs_s32(int32_t a, int32_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqrdmulhs_s32(a, b); #else - return HEDLEY_STATIC_CAST(int32_t, (((HEDLEY_STATIC_CAST(int64_t, 1) << 31) + ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b)))) << 1)) >> 32) & 0xffffffff); + int64_t temp = HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b); + int64_t r = temp > 0 ? (temp > (INT64_MAX >> 1) ? INT64_MAX : (temp << 1)) : (temp < (INT64_MIN >> 1) ? INT64_MIN : (temp << 1)); + r = (r > (INT64_MAX - (HEDLEY_STATIC_CAST(int64_t, 1) << 31))) ? 
INT64_MAX : ((HEDLEY_STATIC_CAST(int64_t, 1) << 31) + r); + return HEDLEY_STATIC_CAST(int32_t, ((r >> 32) & 0xffffffff)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -122,29 +128,8 @@ simde_vqrdmulhq_s16(simde_int16x8_t a, simde_int16x8_t b) { a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - /* https://github.com/WebAssembly/simd/pull/365 */ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vqrdmulhq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i y = _mm_mulhrs_epi16(a_.m128i, b_.m128i); - __m128i tmp = _mm_cmpeq_epi16(y, _mm_set1_epi16(INT16_MAX)); - r_.m128i = _mm_xor_si128(y, tmp); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i prod_lo = _mm_mullo_epi16(a_.m128i, b_.m128i); - const __m128i prod_hi = _mm_mulhi_epi16(a_.m128i, b_.m128i); - const __m128i tmp = - _mm_add_epi16( - _mm_avg_epu16( - _mm_srli_epi16(prod_lo, 14), - _mm_setzero_si128() - ), - _mm_add_epi16(prod_hi, prod_hi) - ); - r_.m128i = - _mm_xor_si128( - tmp, - _mm_cmpeq_epi16(_mm_set1_epi16(INT16_MAX), tmp) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/qrdmulh_lane.h b/arm/neon/qrdmulh_lane.h index 507064eab..2e7f548ec 100644 --- a/arm/neon/qrdmulh_lane.h +++ b/arm/neon/qrdmulh_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QRDMULH_LANE_H) @@ -36,6 +37,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrdmulhh_lane_s16(a, v, lane) vqrdmulhh_lane_s16((a), (v), (lane)) +#else + #define simde_vqrdmulhh_lane_s16(a, v, lane) simde_vqrdmulhh_s16((a), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmulhh_lane_s16 + #define vqrdmulhh_lane_s16(a, v, lane) simde_vqrdmulhh_lane_s16((a), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrdmulhh_laneq_s16(a, v, lane) vqrdmulhh_laneq_s16((a), (v), (lane)) +#else + #define simde_vqrdmulhh_laneq_s16(a, v, lane) simde_vqrdmulhh_s16((a), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmulhh_laneq_s16 + #define vqrdmulhh_laneq_s16(a, v, lane) simde_vqrdmulhh_laneq_s16((a), (v), (lane)) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vqrdmulhs_lane_s32(a, v, lane) \ diff --git a/arm/neon/qrshl.h b/arm/neon/qrshl.h new file mode 100644 index 000000000..74b8f47de --- /dev/null +++ b/arm/neon/qrshl.h @@ -0,0 +1,750 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRSHL_H) +#define SIMDE_ARM_NEON_QRSHL_H +#include "../../x86/avx.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vqrshlb_s8(int8_t a, int8_t b) { + int8_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshlb_s8(a, b); + #else + if (b < -8) { + r = 0; + } else if (b < 0) { + r = HEDLEY_STATIC_CAST(int8_t, a <= 0 + ? ((a + (1 << (-b - 1))) >> -b) + : HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a + (1 << (-b - 1)))) >> -b) & 0x7FUL))); + } else if (b == 0) { + r = a; + } else if (b < 7) { + r = HEDLEY_STATIC_CAST(int8_t, a << b); + if ((r >> b) != a) { + r = (a < 0) ? INT8_MIN : INT8_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? INT8_MIN : INT8_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlb_s8 + #define vqrshlb_s8(a, b) simde_vqrshlb_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqrshlh_s16(int16_t a, int16_t b) { + int16_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshlh_s16(a, b); + #else + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); + + if (b8 <= -16) { + r = 0; + } else if (b8 < 0) { + r = HEDLEY_STATIC_CAST(int16_t, a <= 0 + ? ((a + (1 << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a + (1 << (-b8 - 1)))) >> -b8) & 0x7FFFUL))); + } else if (b8 == 0) { + r = a; + } else if (b8 < 15) { + r = HEDLEY_STATIC_CAST(int16_t, a << b8); + if ((r >> b8) != a) { + r = (a < 0) ? INT16_MIN : INT16_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? INT16_MIN : INT16_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlh_s16 + #define vqrshlh_s16(a, b) simde_vqrshlh_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqrshls_s32(int32_t a, int32_t b) { + int32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshls_s32(a, b); + #else + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); + + if (b8 <= -32) { + r = 0; + } else if (b8 < 0) { + r = a <= 0 + ? ((a + (1 << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a + (1 << (-b8 - 1)))) >> -b8) & 0x7FFFFFFFUL)); + } else if (b8 == 0) { + r = a; + } else if (b8 < 31) { + r = HEDLEY_STATIC_CAST(int32_t, a << b8); + if ((r >> b8) != a) { + r = (a < 0) ? INT32_MIN : INT32_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? 
INT32_MIN : INT32_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshls_s32 + #define vqrshls_s32(a, b) simde_vqrshls_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqrshld_s64(int64_t a, int64_t b) { + int64_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshld_s64(a, b); + #else + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); + + if (b8 <= -64) { + r = 0; + } else if (b8 < 0) { + r = a <= 0 + ? ((a + (INT64_C(1) << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int64_t, ((HEDLEY_STATIC_CAST(uint64_t, + (a + (INT64_C(1) << (-b8 - 1)))) >> -b8) & 0x7FFFFFFFFFFFFFFFUL)); + } else if (b8 == 0) { + r = a; + } else if (b8 < 63) { + r = HEDLEY_STATIC_CAST(int64_t, a << b8); + if ((r >> b8) != a) { + r = (a < 0) ? INT64_MIN : INT64_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? INT64_MIN : INT64_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshld_s64 + #define vqrshld_s64(a, b) simde_vqrshld_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vqrshlb_u8(uint8_t a, int8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshlb_u8(a, HEDLEY_STATIC_CAST(uint8_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + /* https://github.com/llvm/llvm-project/commit/f0a78bdfdc6d56b25e0081884580b3960a3c2429 */ + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshlb_u8(a, b); + HEDLEY_DIAGNOSTIC_POP + #else + return vqrshlb_u8(a, b); + #endif + #else + uint8_t r; + + if (b < -8) { + r = 0; + } else if (b < 0) { + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 8) { + r = HEDLEY_STATIC_CAST(uint8_t, a << b); + if ((r >> b) != a) { + r = UINT8_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT8_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlb_u8 + #define vqrshlb_u8(a, b) simde_vqrshlb_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vqrshlh_u16(uint16_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshlh_u16(a, HEDLEY_STATIC_CAST(uint16_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshlh_u16(a, b); + HEDLEY_DIAGNOSTIC_POP + #else + return vqrshlh_u16(a, b); + #endif + #else + b = HEDLEY_STATIC_CAST(int8_t, b); + uint16_t r; + + if (b < -16) { + r = 0; + } else if (b < 0) { + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 16) { + r = HEDLEY_STATIC_CAST(uint16_t, a << b); + if ((r >> b) != a) { + r = UINT16_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT16_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlh_u16 + #define vqrshlh_u16(a, b) simde_vqrshlh_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vqrshls_u32(uint32_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshls_u32(a, HEDLEY_STATIC_CAST(uint16_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshls_u32(a, b); + 
HEDLEY_DIAGNOSTIC_POP + #else + return vqrshls_u32(a, b); + #endif + #else + b = HEDLEY_STATIC_CAST(int8_t, b); + uint32_t r; + + if (b < -32) { + r = 0; + } else if (b < 0) { + if (b == -32) + r = (a >> 31) & 1; + else + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 32) { + r = HEDLEY_STATIC_CAST(uint32_t, a << b); + if ((r >> b) != a) { + r = UINT32_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT32_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshls_u32 + #define vqrshls_u32(a, b) simde_vqrshls_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vqrshld_u64(uint64_t a, int64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshld_u64(a, HEDLEY_STATIC_CAST(uint16_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshld_u64(a, b); + HEDLEY_DIAGNOSTIC_POP + #else + return vqrshld_u64(a, b); + #endif + #else + b = HEDLEY_STATIC_CAST(int8_t, b); + uint64_t r; + + if (b < -64) { + r = 0; + } else if (b < 0) { + if (b == -64) + r = (a >> 63) & 1; + else + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 64) { + r = HEDLEY_STATIC_CAST(uint64_t, a << b); + if ((r >> b) != a) { + r = UINT64_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT64_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshld_u64 + #define vqrshld_u64(a, b) simde_vqrshld_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vqrshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s8(a, b); + #else + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_s8(a_.values[i], b_.values[i]); + } + + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s8 + #define vqrshl_s8(a, b) simde_vqrshl_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqrshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s16(a, b); + #else + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_s16(a_.values[i], b_.values[i]); + } + + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s16 + #define vqrshl_s16(a, b) simde_vqrshl_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqrshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s32(a, b); + #else + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_s32(a_.values[i], b_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s32 + #define vqrshl_s32(a, b) simde_vqrshl_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vqrshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s64(a, b); + #else + simde_int64x1_private + r_, + a_ = simde_int64x1_to_private(a), + b_ = simde_int64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_s64(a_.values[i], b_.values[i]); + } + + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s64 + #define vqrshl_s64(a, b) simde_vqrshl_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vqrshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u8(a, b); + #else + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_u8(a_.values[i], b_.values[i]); + } + + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u8 + #define vqrshl_u8(a, b) simde_vqrshl_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vqrshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u16(a, b); + #else + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_u16(a_.values[i], b_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u16 + #define vqrshl_u16(a, b) simde_vqrshl_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vqrshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u32(a, b); + #else + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_u32(a_.values[i], b_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u32 + #define vqrshl_u32(a, b) simde_vqrshl_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vqrshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u64(a, b); + #else + simde_uint64x1_private + r_, + a_ = simde_uint64x1_to_private(a); + simde_int64x1_private b_ = simde_int64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_u64(a_.values[i], b_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u64 + #define vqrshl_u64(a, b) simde_vqrshl_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_int8x16_t +simde_vqrshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s8(a, b); + #else + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_s8(a_.values[i], b_.values[i]); + } + + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s8 + #define vqrshlq_s8(a, b) simde_vqrshlq_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s16(a, b); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_s16(a_.values[i], b_.values[i]); + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s16 + #define vqrshlq_s16(a, b) simde_vqrshlq_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s32(a, b); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_s32(a_.values[i], b_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s32 + #define vqrshlq_s32(a, b) simde_vqrshlq_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqrshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s64(a, b); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_s64(a_.values[i], b_.values[i]); + } + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s64 + #define vqrshlq_s64(a, b) simde_vqrshlq_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqrshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u8(a, b); + #else + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_u8(a_.values[i], b_.values[i]); + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u8 + #define vqrshlq_u8(a, b) simde_vqrshlq_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqrshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u16(a, b); + #else + simde_uint16x8_private + r_, + a_ = 
simde_uint16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_u16(a_.values[i], b_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u16 + #define vqrshlq_u16(a, b) simde_vqrshlq_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqrshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u32(a, b); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_u32(a_.values[i], b_.values[i]); + } + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u32 + #define vqrshlq_u32(a, b) simde_vqrshlq_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vqrshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a); + simde_int64x2_private b_ = simde_int64x2_to_private(b); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_u64(a_.values[i], b_.values[i]); + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u64 + #define vqrshlq_u64(a, b) simde_vqrshlq_u64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRSHL_H) */ diff --git a/arm/neon/qrshrn_high_n.h b/arm/neon/qrshrn_high_n.h new file mode 100644 index 000000000..0080e739a --- /dev/null +++ b/arm/neon/qrshrn_high_n.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
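For reference, the scalar SQRSHL helpers above all follow the same rule: a negative shift count performs a rounding (round-half-up) right shift, a zero count is the identity, and a positive count is a left shift that saturates to the type limits when bits would be lost. A minimal stand-alone model of the 8-bit signed case is sketched below; sqrshl_s8_ref is an invented name used only for illustration and is not the SIMDe implementation.

#include <stdint.h>

/* Reference model of a saturating, rounding left shift on int8_t:
 * b < 0 -> rounding right shift, b > 0 -> left shift with saturation. */
static int8_t sqrshl_s8_ref(int8_t a, int8_t b) {
  if (b < -8) return 0;                                        /* everything, including the rounding bit, is shifted out */
  if (b < 0)  return (int8_t) ((a + (1 << (-b - 1))) >> -b);   /* add 2^(-b-1), then arithmetic shift right */
  if (b == 0) return a;
  int32_t wide = (int32_t) a << (b > 8 ? 8 : b);               /* widen first so the shift itself cannot overflow */
  if (wide > INT8_MAX) return INT8_MAX;
  if (wide < INT8_MIN) return INT8_MIN;
  return (int8_t) wide;
}

For example, sqrshl_s8_ref(100, 1) saturates to 127, while sqrshl_s8_ref(1, -1) rounds 0.5 up to 1.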
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRSHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_QRSHRN_HIGH_N_H + +#include "combine.h" +#include "qmovn.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vqrshrn_high_n_s16(simde_int8x8_t r, simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int16_t tmp = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] + (1 << (n - 1))) >> n); + if (tmp > INT8_MAX) tmp = INT8_MAX; + else if (tmp < INT8_MIN) tmp = INT8_MIN; + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp); + } + return simde_vcombine_s8(r, simde_vqmovn_s16(simde_int16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_s16(r, a, n) vqrshrn_high_n_s16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_s16 + #define vqrshrn_high_n_s16(r, a, n) simde_vqrshrn_high_n_s16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrshrn_high_n_s32(simde_int16x4_t r, simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int32_t tmp = (a_.values[i] >> ((n == 32) ? 31 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int32_t, UINT32_C(1) << (n - 1))) != 0); + if (tmp > INT16_MAX) tmp = INT16_MAX; + else if (tmp < INT16_MIN) tmp = INT16_MIN; + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, tmp); + } + return simde_vcombine_s16(r, simde_vqmovn_s32(simde_int32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_s32(r, a, n) vqrshrn_high_n_s32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_s32 + #define vqrshrn_high_n_s32(r, a, n) simde_vqrshrn_high_n_s32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrshrn_high_n_s64(simde_int32x2_t r, simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int64_t tmp = (a_.values[i] >> ((n == 64) ? 
63 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int64_t, UINT64_C(1) << (n - 1))) != 0); + if (tmp > INT32_MAX) tmp = INT32_MAX; + else if (tmp < INT32_MIN) tmp = INT32_MIN; + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp); + } + return simde_vcombine_s32(r, simde_vqmovn_s64(simde_int64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_s64(r, a, n) vqrshrn_high_n_s64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_s64 + #define vqrshrn_high_n_s64(r, a, n) simde_vqrshrn_high_n_s64((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqrshrn_high_n_u16(simde_uint8x8_t r, simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t tmp = HEDLEY_STATIC_CAST(uint16_t, (a_.values[i] + (1 << (n - 1))) >> n); + if (tmp > UINT8_MAX) tmp = UINT8_MAX; + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp); + } + return simde_vcombine_u8(r, simde_vqmovn_u16(simde_uint16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_u16(r, a, n) vqrshrn_high_n_u16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_u16 + #define vqrshrn_high_n_u16(r, a, n) simde_vqrshrn_high_n_u16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqrshrn_high_n_u32(simde_uint16x4_t r, simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint32_t tmp = (a_.values[i] >> ((n == 32) ? 31 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(uint32_t, UINT32_C(1) << (n - 1))) != 0); + if (tmp > UINT16_MAX) tmp = UINT16_MAX; + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp); + } + return simde_vcombine_u16(r, simde_vqmovn_u32(simde_uint32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_u32(r, a, n) vqrshrn_high_n_u32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_u32 + #define vqrshrn_high_n_u32(r, a, n) simde_vqrshrn_high_n_u32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqrshrn_high_n_u64(simde_uint32x2_t r, simde_uint64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint64_t tmp = (a_.values[i] >> ((n == 64) ? 
63 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(uint64_t, UINT64_C(1) << (n - 1))) != 0); + if (tmp > UINT32_MAX) tmp = UINT32_MAX; + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp); + } + return simde_vcombine_u32(r, simde_vqmovn_u64(simde_uint64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_u64(r, a, n) vqrshrn_high_n_u64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_u64 + #define vqrshrn_high_n_u64(r, a, n) simde_vqrshrn_high_n_u64((r), (a), (n)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSHRN_HIGH_N_H) */ diff --git a/arm/neon/qrshrn_n.h b/arm/neon/qrshrn_n.h index f5864ae00..75f0a846c 100644 --- a/arm/neon/qrshrn_n.h +++ b/arm/neon/qrshrn_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QRSHRN_N_H) @@ -35,6 +36,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrnh_n_s16(a, n) vqrshrnh_n_s16(a, n) +#else + #define simde_vqrshrnh_n_s16(a, n) simde_vqmovnh_s16(simde_x_vrshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrnh_n_s16 + #define vqrshrnh_n_s16(a, n) simde_vqrshrnh_n_s16(a, n) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrnh_n_u16(a, n) vqrshrnh_n_u16(a, n) +#else + #define simde_vqrshrnh_n_u16(a, n) simde_vqmovnh_u16(simde_x_vrshrh_n_u16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrnh_n_u16 + #define vqrshrnh_n_u16(a, n) simde_vqrshrnh_n_u16(a, n) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqrshrns_n_s32(a, n) vqrshrns_n_s32(a, n) #else diff --git a/arm/neon/qrshrun_high_n.h b/arm/neon/qrshrun_high_n.h new file mode 100644 index 000000000..a06abe776 --- /dev/null +++ b/arm/neon/qrshrun_high_n.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
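The _high_n narrowing helpers above (and the unsigned-saturating variants that follow) all reduce to the same per-lane operation: shift right by n with round-half-up, clamp to the destination range, then combine the narrowed half onto r. A rough single-lane model for the int16 to int8 case is sketched below; qrshrn_lane_s16 is an invented name used only for illustration.

#include <stdint.h>

/* One lane of a rounding, saturating narrow: int16_t -> int8_t,
 * shifting right by n (1..8) with round-half-up before clamping. */
static int8_t qrshrn_lane_s16(int16_t a, int n) {
  int32_t tmp = ((int32_t) a + (1 << (n - 1))) >> n;  /* rounding right shift in a wider type */
  if (tmp > INT8_MAX) return INT8_MAX;
  if (tmp < INT8_MIN) return INT8_MIN;
  return (int8_t) tmp;
}

The full intrinsic then performs the equivalent of simde_vcombine_s8(r, narrowed_half), as in the loop-based implementations above.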
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRSHRUN_HIGH_N_H) +#define SIMDE_ARM_NEON_QRSHRUN_HIGH_N_H + +#include "combine.h" +#include "qmovn.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqrshrun_high_n_s16(simde_uint8x8_t r, simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int16_t tmp = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] + (1 << (n - 1))) >> n); + if (tmp > UINT8_MAX) tmp = UINT8_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp); + } + return simde_vcombine_u8(r, simde_vqmovn_u16(simde_uint16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) + #define simde_vqrshrun_high_n_s16(r, a, n) vqrshrun_high_n_s16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(__clang__)) + #undef vqrshrun_high_n_s16 + #define vqrshrun_high_n_s16(r, a, n) simde_vqrshrun_high_n_s16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqrshrun_high_n_s32(simde_uint16x4_t r, simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int32_t tmp = (a_.values[i] >> ((n == 32) ? 31 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int32_t, UINT32_C(1) << (n - 1))) != 0); + if (tmp > UINT16_MAX) tmp = UINT16_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp); + } + return simde_vcombine_u16(r, simde_vqmovn_u32(simde_uint32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) + #define simde_vqrshrun_high_n_s32(r, a, n) vqrshrun_high_n_s32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(__clang__)) + #undef vqrshrun_high_n_s32 + #define vqrshrun_high_n_s32(r, a, n) simde_vqrshrun_high_n_s32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqrshrun_high_n_s64(simde_uint32x2_t r, simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int64_t tmp = (a_.values[i] >> ((n == 64) ? 
63 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int64_t, UINT64_C(1) << (n - 1))) != 0); + if (tmp > UINT32_MAX) tmp = UINT32_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp); + } + return simde_vcombine_u32(r, simde_vqmovn_u64(simde_uint64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) + #define simde_vqrshrun_high_n_s64(r, a, n) vqrshrun_high_n_s64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(__clang__)) + #undef vqrshrun_high_n_s64 + #define vqrshrun_high_n_s64(r, a, n) simde_vqrshrun_high_n_s64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRSHRUN_HIGH_N_H) */ diff --git a/arm/neon/qrshrun_n.h b/arm/neon/qrshrun_n.h index 8903d9ffb..7eac18054 100644 --- a/arm/neon/qrshrun_n.h +++ b/arm/neon/qrshrun_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QRSHRUN_N_H) @@ -36,7 +37,11 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define simde_vqrshruns_n_s32(a, n) vqrshruns_n_s32(a, n) + #if defined(SIMDE_BUG_CLANG_71751) + #define simde_vqrshruns_n_s32(a, n) HEDLEY_STATIC_CAST(uint16_t, vqrshruns_n_s32((a), (n))) + #else + #define simde_vqrshruns_n_s32(a, n) vqrshruns_n_s32((a), (n)) + #endif #else #define simde_vqrshruns_n_s32(a, n) simde_vqmovuns_s32(simde_x_vrshrs_n_s32(a, n)) #endif @@ -46,15 +51,33 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define simde_vqrshrund_n_s64(a, n) vqrshrund_n_s64(a, n) + #if defined(SIMDE_BUG_CLANG_71751) + #define simde_vqrshrund_n_s64(a, n) HEDLEY_STATIC_CAST(uint32_t, vqrshrund_n_s64((a), (n))) + #else + #define simde_vqrshrund_n_s64(a, n) vqrshrund_n_s64((a), (n)) + #endif #else - #define simde_vqrshrund_n_s64(a, n) simde_vqmovund_s64(simde_vrshrd_n_s64(a, n)) + #define simde_vqrshrund_n_s64(a, n) simde_vqmovund_s64(simde_vrshrd_n_s64((a), (n))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqrshrund_n_s64 #define vqrshrund_n_s64(a, n) simde_vqrshrund_n_s64((a), (n)) #endif +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_BUG_CLANG_71751) + #define simde_vqrshrunh_n_s16(a, n) HEDLEY_STATIC_CAST(uint8_t, vqrshrunh_n_s16((a), (n))) + #else + #define simde_vqrshrunh_n_s16(a, n) vqrshrunh_n_s16((a), (n)) + #endif +#else + #define simde_vqrshrunh_n_s16(a, n) simde_vqmovunh_s16(simde_x_vrshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrunh_n_s16 + #define vqrshrunh_n_s16(a, n) simde_vqrshrunh_n_s16((a), (n)) +#endif + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_vqrshrun_n_s16(a, n) vqrshrun_n_s16((a), (n)) #else diff --git a/arm/neon/qshl.h b/arm/neon/qshl.h index 279afe708..a01556ed8 100644 --- a/arm/neon/qshl.h +++ b/arm/neon/qshl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHL_H) @@ -186,22 +187,24 @@ simde_vqshlb_u8(uint8_t a, int8_t b) { #endif #else uint8_t r; + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); - if (b < -7) - b = -7; - - if (b <= 0) { - r = a >> -b; - } else if (b < 7) { - r = HEDLEY_STATIC_CAST(uint8_t, a << b); - if ((r >> b) != a) { - r = UINT8_MAX; - } - } else if (a == 0) { + if ((b8 <= -8) || 
(a == 0)) + { r = 0; - } else { + } + else if (b8 >= 8) + { r = UINT8_MAX; } + else if (b8 <= 0) { + r = a >> -b8; + } else { + r = HEDLEY_STATIC_CAST(uint8_t, a << b8); + if ((r >> b8) != a) { + r = UINT8_MAX; + } + } return r; #endif @@ -227,22 +230,24 @@ simde_vqshlh_u16(uint16_t a, int16_t b) { #endif #else uint16_t r; + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); - if (b < -15) - b = -15; - - if (b <= 0) { - r = a >> -b; - } else if (b < 15) { - r = HEDLEY_STATIC_CAST(uint16_t, a << b); - if ((r >> b) != a) { - r = UINT16_MAX; - } - } else if (a == 0) { + if ((b8 <= -16) || (a == 0)) + { r = 0; - } else { + } + else if (b8 >= 16) + { r = UINT16_MAX; } + else if (b8 <= 0) { + r = a >> -b8; + } else { + r = HEDLEY_STATIC_CAST(uint16_t, a << b8); + if ((r >> b8) != a) { + r = UINT16_MAX; + } + } return r; #endif @@ -268,22 +273,24 @@ simde_vqshls_u32(uint32_t a, int32_t b) { #endif #else uint32_t r; + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); - if (b < -31) - b = -31; - - if (b <= 0) { - r = HEDLEY_STATIC_CAST(uint32_t, a >> -b); - } else if (b < 31) { - r = a << b; - if ((r >> b) != a) { - r = UINT32_MAX; - } - } else if (a == 0) { + if ((b8 <= -32) || (a == 0)) + { r = 0; - } else { + } + else if (b8 >= 32) + { r = UINT32_MAX; } + else if (b8 <= 0) { + r = a >> -b8; + } else { + r = HEDLEY_STATIC_CAST(uint32_t, a << b8); + if ((r >> b8) != a) { + r = UINT32_MAX; + } + } return r; #endif @@ -309,28 +316,30 @@ simde_vqshld_u64(uint64_t a, int64_t b) { #endif #else uint64_t r; + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); - if (b < -63) - b = -63; - - if (b <= 0) { - r = a >> -b; - } else if (b < 63) { - r = HEDLEY_STATIC_CAST(uint64_t, a << b); - if ((r >> b) != a) { - r = UINT64_MAX; - } - } else if (a == 0) { + if ((b8 <= -64) || (a == 0)) + { r = 0; - } else { + } + else if (b8 >= 64) + { r = UINT64_MAX; } + else if (b8 <= 0) { + r = a >> -b8; + } else { + r = HEDLEY_STATIC_CAST(uint64_t, a << b8); + if ((r >> b8) != a) { + r = UINT64_MAX; + } + } return r; #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - #undef vqshldb_u64 + #undef vqshld_u64 #define vqshld_u64(a, b) simde_vqshld_u64((a), (b)) #endif diff --git a/arm/neon/qshl_n.h b/arm/neon/qshl_n.h new file mode 100644 index 000000000..e3d4c924d --- /dev/null +++ b/arm/neon/qshl_n.h @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
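The reworked UQSHL code above, and the _n immediate forms that follow, implement one simple rule for unsigned operands: shift left and, if any shifted-out bit was set, saturate to the type maximum. A compact reference for the 8-bit case is sketched below; the helper name is invented for this illustration.

#include <stdint.h>

/* Unsigned saturating left shift of a uint8_t by a constant 0..7. */
static uint8_t uqshl_n_u8_ref(uint8_t a, unsigned n) {
  uint32_t wide = (uint32_t) a << n;                  /* widen so no bits are lost */
  return (wide > UINT8_MAX) ? UINT8_MAX : (uint8_t) wide;
}

For example, uqshl_n_u8_ref(0x90, 1) saturates to 0xFF, while uqshl_n_u8_ref(0x21, 2) yields 0x84.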
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QSHL_N_H) +#define SIMDE_ARM_NEON_QSHL_N_H + +#include "types.h" +#include "cls.h" +#include "qshl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vqshlb_n_s8(int8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + return simde_vqshlb_s8(a, HEDLEY_STATIC_CAST(int8_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlb_n_s8(a, n) vqshlb_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlb_n_s8 + #define vqshlb_n_s8(a, n) simde_vqshlb_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqshlh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + return simde_vqshlh_s16(a, HEDLEY_STATIC_CAST(int16_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlh_n_s16(a, n) vqshlh_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlh_n_s16 + #define vqshlh_n_s16(a, n) simde_vqshlh_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqshls_n_s32(int32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + return simde_vqshls_s32(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshls_n_s32(a, n) vqshls_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshls_n_s32 + #define vqshls_n_s32(a, n) simde_vqshls_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqshld_n_s64(int64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + return simde_vqshld_s64(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshld_n_s64(a, n) vqshld_n_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshld_n_s64 + #define vqshld_n_s64(a, n) simde_vqshld_n_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vqshlb_n_u8(uint8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + return simde_vqshlb_u8(a, HEDLEY_STATIC_CAST(int8_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlb_n_u8(a, n) vqshlb_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlb_n_u8 + #define vqshlb_n_u8(a, n) simde_vqshlb_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vqshlh_n_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + return simde_vqshlh_u16(a, HEDLEY_STATIC_CAST(int16_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlh_n_u16(a, n) vqshlh_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlh_n_u16 + #define vqshlh_n_u16(a, n) simde_vqshlh_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vqshls_n_u32(uint32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + return simde_vqshls_u32(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshls_n_u32(a, n) vqshls_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshls_n_u32 + #define vqshls_n_u32(a, n) simde_vqshls_n_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vqshld_n_u64(uint64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + return simde_vqshld_u64(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshld_n_u64(a, n) vqshld_n_u64((a), 
(n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshld_n_u64 + #define vqshld_n_u64(a, n) simde_vqshld_n_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vqshl_n_s8 (const simde_int8x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_s8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_int8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s8(a, n) vqshl_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s8 + #define vqshl_n_s8(a, n) simde_vqshl_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqshl_n_s16 (const simde_int16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_s16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_int16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s16(a, n) vqshl_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s16 + #define vqshl_n_s16(a, n) simde_vqshl_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqshl_n_s32 (const simde_int32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_s32(a_.values[i], n); + } + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s32(a, n) vqshl_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s32 + #define vqshl_n_s32(a, n) simde_vqshl_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vqshl_n_s64 (const simde_int64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_int64x1_private + r_, + a_ = simde_int64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_s64(a_.values[i], n); + } + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s64(a, n) vqshl_n_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s64 + #define vqshl_n_s64(a, n) simde_vqshl_n_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vqshl_n_u8 (const simde_uint8x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_u8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_uint8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u8(a, n) vqshl_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u8 + #define vqshl_n_u8(a, n) 
simde_vqshl_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vqshl_n_u16 (const simde_uint16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_u16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u16(a, n) vqshl_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u16 + #define vqshl_n_u16(a, n) simde_vqshl_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vqshl_n_u32 (const simde_uint32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_u32(a_.values[i], n); + } + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u32(a, n) vqshl_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u32 + #define vqshl_n_u32(a, n) simde_vqshl_n_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vqshl_n_u64 (const simde_uint64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_uint64x1_private + r_, + a_ = simde_uint64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_u64(a_.values[i], n); + } + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u64(a, n) vqshl_n_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u64 + #define vqshl_n_u64(a, n) simde_vqshl_n_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vqshlq_n_s8 (const simde_int8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_s8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_int8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s8(a, n) vqshlq_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s8 + #define vqshlq_n_s8(a, n) simde_vqshlq_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqshlq_n_s16 (const simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_s16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s16(a, n) vqshlq_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s16 + #define vqshlq_n_s16(a, n) simde_vqshlq_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t 
+simde_vqshlq_n_s32 (const simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_s32(a_.values[i], n); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s32(a, n) vqshlq_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s32 + #define vqshlq_n_s32(a, n) simde_vqshlq_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqshlq_n_s64 (const simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_s64(a_.values[i], n); + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s64(a, n) vqshlq_n_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s64 + #define vqshlq_n_s64(a, n) simde_vqshlq_n_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqshlq_n_u8 (const simde_uint8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_u8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_uint8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u8(a, n) vqshlq_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u8 + #define vqshlq_n_u8(a, n) simde_vqshlq_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqshlq_n_u16 (const simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_u16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u16(a, n) vqshlq_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u16 + #define vqshlq_n_u16(a, n) simde_vqshlq_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqshlq_n_u32 (const simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_u32(a_.values[i], n); + } + + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u32(a, n) vqshlq_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u32 + #define vqshlq_n_u32(a, n) simde_vqshlq_n_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vqshlq_n_u64 (const simde_uint64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + 
simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_u64(a_.values[i], n); + } + + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u64(a, n) vqshlq_n_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u64 + #define vqshlq_n_u64(a, n) simde_vqshlq_n_u64((a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QSHL_N_H) */ diff --git a/arm/neon/qshlu_n.h b/arm/neon/qshlu_n.h index a39f6795a..43b0b3ee5 100644 --- a/arm/neon/qshlu_n.h +++ b/arm/neon/qshlu_n.h @@ -22,6 +22,8 @@ * * Copyright: * 2021 Atharva Nimbalkar + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QSHLU_N_H) @@ -56,6 +58,22 @@ simde_vqshlub_n_s8(int8_t a, const int n) #define vqshlub_n_s8(a, n) simde_vqshlub_n_s8((a), (n)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vqshluh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + uint16_t r = HEDLEY_STATIC_CAST(uint16_t, a << n); + r |= (((r >> n) != HEDLEY_STATIC_CAST(uint16_t, a)) ? UINT16_MAX : 0); + return (a < 0) ? 0 : r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshluh_n_s16(a, n) HEDLEY_STATIC_CAST(uint16_t, vqshluh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshluh_n_s16 + #define vqshluh_n_s16(a, n) simde_vqshluh_n_s16((a), (n)) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vqshlus_n_s32(int32_t a, const int n) @@ -76,8 +94,8 @@ SIMDE_FUNCTION_ATTRIBUTES uint64_t simde_vqshlud_n_s64(int64_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { - uint32_t r = HEDLEY_STATIC_CAST(uint32_t, a << n); - r |= (((r >> n) != HEDLEY_STATIC_CAST(uint32_t, a)) ? UINT32_MAX : 0); + uint64_t r = HEDLEY_STATIC_CAST(uint64_t, a << n); + r |= (((r >> n) != HEDLEY_STATIC_CAST(uint64_t, a)) ? UINT64_MAX : 0); return (a < 0) ? 
0 : r; } #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) @@ -105,8 +123,11 @@ simde_vqshlu_n_s8(simde_int8x8_t a, const int n) #else simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint8x8_private r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t shift = __riscv_vsll_vx_u8m1(__riscv_vreinterpret_v_i8m1_u8m1(a_.sv64), n, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(shift, UINT8_MAX, __riscv_vmsne_vv_u8m1_b8(__riscv_vsrl_vx_u8m1(shift, n, 8), __riscv_vreinterpret_v_i8m1_u8m1(a_.sv64), 8), 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, 0, __riscv_vmslt_vx_i8m1_b8(a_.sv64, 0, 8), 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -151,8 +172,11 @@ simde_vqshlu_n_s16(simde_int16x4_t a, const int n) #else simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_uint16x4_private r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t shift = __riscv_vsll_vx_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(a_.sv64), n, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(shift, UINT16_MAX, __riscv_vmsne_vv_u16m1_b16(__riscv_vsrl_vx_u16m1(shift, n, 4), __riscv_vreinterpret_v_i16m1_u16m1(a_.sv64), 4), 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, 0, __riscv_vmslt_vx_i16m1_b16(a_.sv64, 0, 4), 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -200,7 +224,11 @@ simde_vqshlu_n_s32(simde_int32x2_t a, const int n) simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t shift = __riscv_vsll_vx_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(a_.sv64), n, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(shift, UINT32_MAX, __riscv_vmsne_vv_u32m1_b32(__riscv_vsrl_vx_u32m1(shift, n, 2), __riscv_vreinterpret_v_i32m1_u32m1(a_.sv64), 2), 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, 0, __riscv_vmslt_vx_i32m1_b32(a_.sv64, 0, 2), 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -247,7 +275,11 @@ simde_vqshlu_n_s64(simde_int64x1_t a, const int n) simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t shift = __riscv_vsll_vx_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(a_.sv64), n, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(shift, UINT64_MAX, __riscv_vmsne_vv_u64m1_b64(__riscv_vsrl_vx_u64m1(shift, n, 1), __riscv_vreinterpret_v_i64m1_u64m1(a_.sv64), 1), 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, 0, __riscv_vmslt_vx_i64m1_b64(a_.sv64, 0, 1), 1); 
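The RVV branches being added here, like the portable and WASM paths around them, implement the VQSHLU rule: the signed input is shifted left, negative inputs clamp to zero, and any lost bits saturate the result to the unsigned maximum. A scalar sketch of the 16-bit case, mirroring the vqshluh_n_s16 helper added above, follows; the function name is invented for the example.

#include <stdint.h>

/* Signed-to-unsigned saturating left shift: negative inputs clamp to 0,
 * overflow saturates to UINT16_MAX. n is expected to be in 0..15. */
static uint16_t qshlu_n_s16_ref(int16_t a, unsigned n) {
  if (a < 0) return 0;
  uint32_t wide = (uint32_t) a << n;   /* widen so the shift cannot lose bits */
  return (wide > UINT16_MAX) ? UINT16_MAX : (uint16_t) wide;
}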
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -287,6 +319,10 @@ simde_vqshluq_n_s8(simde_int8x16_t a, const int n) const v128_t overflow = wasm_i8x16_ne(a_.v128, wasm_u8x16_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i8x16_shr(a_.v128, 7)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t shift = __riscv_vsll_vx_u8m1(__riscv_vreinterpret_v_i8m1_u8m1(a_.sv128), n, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(shift, UINT8_MAX, __riscv_vmsne_vv_u8m1_b8(__riscv_vsrl_vx_u8m1(shift, n, 16), __riscv_vreinterpret_v_i8m1_u8m1(a_.sv128), 16), 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, 0, __riscv_vmslt_vx_i8m1_b8(a_.sv128, 0, 16), 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; @@ -326,6 +362,10 @@ simde_vqshluq_n_s16(simde_int16x8_t a, const int n) const v128_t overflow = wasm_i16x8_ne(a_.v128, wasm_u16x8_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i16x8_shr(a_.v128, 15)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t shift = __riscv_vsll_vx_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(a_.sv128), n, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(shift, UINT16_MAX, __riscv_vmsne_vv_u16m1_b16(__riscv_vsrl_vx_u16m1(shift, n, 8), __riscv_vreinterpret_v_i16m1_u16m1(a_.sv128), 8), 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, 0, __riscv_vmslt_vx_i16m1_b16(a_.sv128, 0, 8), 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; @@ -365,6 +405,10 @@ simde_vqshluq_n_s32(simde_int32x4_t a, const int n) const v128_t overflow = wasm_i32x4_ne(a_.v128, wasm_u32x4_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i32x4_shr(a_.v128, 31)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t shift = __riscv_vsll_vx_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(a_.sv128), n, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(shift, UINT32_MAX, __riscv_vmsne_vv_u32m1_b32(__riscv_vsrl_vx_u32m1(shift, n, 4), __riscv_vreinterpret_v_i32m1_u32m1(a_.sv128), 4), 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, 0, __riscv_vmslt_vx_i32m1_b32(a_.sv128, 0, 4), 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; @@ -404,6 +448,10 @@ simde_vqshluq_n_s64(simde_int64x2_t a, const int n) const v128_t overflow = wasm_i64x2_ne(a_.v128, wasm_u64x2_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i64x2_shr(a_.v128, 63)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t shift = __riscv_vsll_vx_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(a_.sv128), n, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(shift, UINT64_MAX, __riscv_vmsne_vv_u64m1_b64(__riscv_vsrl_vx_u64m1(shift, n, 2), __riscv_vreinterpret_v_i64m1_u64m1(a_.sv128), 2), 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, 0, __riscv_vmslt_vx_i64m1_b64(a_.sv128, 0, 2), 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) 
__typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; diff --git a/arm/neon/qshrn_high_n.h b/arm/neon/qshrn_high_n.h new file mode 100644 index 000000000..59e6d8d93 --- /dev/null +++ b/arm/neon/qshrn_high_n.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QSHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_QSHRN_HIGH_N_H + +#include "types.h" +#include "shr_n.h" +#include "qmovn.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_s16(r, a, n) vqshrn_high_n_s16((r), (a), (n)) +#else + #define simde_vqshrn_high_n_s16(r, a, n) simde_vcombine_s8(r, simde_vqmovn_s16(simde_vshrq_n_s16(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_s16 + #define vqshrn_high_n_s16(r, a, n) simde_vqshrn_high_n_s16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_s32(r, a, n) vqshrn_high_n_s32((r), (a), (n)) +#else + #define simde_vqshrn_high_n_s32(r, a, n) simde_vcombine_s16(r, simde_vqmovn_s32(simde_vshrq_n_s32(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_s32 + #define vqshrn_high_n_s32(r, a, n) simde_vqshrn_high_n_s32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_s64(r, a, n) vqshrn_high_n_s64((r), (a), (n)) +#else + #define simde_vqshrn_high_n_s64(r, a, n) simde_vcombine_s32(r, simde_vqmovn_s64(simde_vshrq_n_s64(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_s64 + #define vqshrn_high_n_s64(r, a, n) simde_vqshrn_high_n_s64((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_u16(r, a, n) vqshrn_high_n_u16((r), (a), (n)) +#else + #define simde_vqshrn_high_n_u16(r, a, n) simde_vcombine_u8(r, simde_vqmovn_u16(simde_vshrq_n_u16(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_u16 + #define vqshrn_high_n_u16(r, a, n) simde_vqshrn_high_n_u16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_u32(r, a, n) vqshrn_high_n_u32((r), (a), (n)) +#else + #define simde_vqshrn_high_n_u32(r, a, n) simde_vcombine_u16(r, 
simde_vqmovn_u32(simde_vshrq_n_u32(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_u32 + #define vqshrn_high_n_u32(r, a, n) simde_vqshrn_high_n_u32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_u64(r, a, n) vqshrn_high_n_u64((r), (a), (n)) +#else + #define simde_vqshrn_high_n_u64(r, a, n) simde_vcombine_u32(r, simde_vqmovn_u64(simde_vshrq_n_u64(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_u64 + #define vqshrn_high_n_u64(r, a, n) simde_vqshrn_high_n_u64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QSHRN_HIGH_N_H) */ diff --git a/arm/neon/qshrn_n.h b/arm/neon/qshrn_n.h index 93ab96c1f..abd47dcf7 100644 --- a/arm/neon/qshrn_n.h +++ b/arm/neon/qshrn_n.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHRN_N_H) @@ -36,6 +37,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrnh_n_s16(a, n) vqshrnh_n_s16(a, n) +#else + #define simde_vqshrnh_n_s16(a, n) simde_vqmovnh_s16(simde_x_vshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrnh_n_s16 + #define vqshrnh_n_s16(a, n) simde_vqshrnh_n_s16(a, n) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrnh_n_u16(a, n) vqshrnh_n_u16(a, n) +#else + #define simde_vqshrnh_n_u16(a, n) simde_vqmovnh_u16(simde_x_vshrh_n_u16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrnh_n_u16 + #define vqshrnh_n_u16(a, n) simde_vqshrnh_n_u16(a, n) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqshrns_n_s32(a, n) vqshrns_n_s32(a, n) #else diff --git a/arm/neon/qshrun_high_n.h b/arm/neon/qshrun_high_n.h new file mode 100644 index 000000000..acea87463 --- /dev/null +++ b/arm/neon/qshrun_high_n.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QSHRUN_HIGH_N_H) +#define SIMDE_ARM_NEON_QSHRUN_HIGH_N_H + +#include "combine.h" +#include "qmovn.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqshrun_high_n_s16(simde_uint8x8_t r, simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int16_t tmp = (a_.values[i]) >> n; + if (tmp > UINT8_MAX) tmp = UINT8_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp); + } + return simde_vcombine_u8(r, simde_vqmovn_u16(simde_uint16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) + #define simde_vqshrun_high_n_s16(r, a, n) vqshrun_high_n_s16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(SIMDE_BUG_CLANG_71365)) + #undef vqshrun_high_n_s16 + #define vqshrun_high_n_s16(r, a, n) simde_vqshrun_high_n_s16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqshrun_high_n_s32(simde_uint16x4_t r, simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int32_t tmp = (a_.values[i] >> n); + if (tmp > UINT16_MAX) tmp = UINT16_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp); + } + return simde_vcombine_u16(r, simde_vqmovn_u32(simde_uint32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) + #define simde_vqshrun_high_n_s32(r, a, n) vqshrun_high_n_s32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(SIMDE_BUG_CLANG_71365)) + #undef vqshrun_high_n_s32 + #define vqshrun_high_n_s32(r, a, n) simde_vqshrun_high_n_s32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqshrun_high_n_s64(simde_uint32x2_t r, simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int64_t tmp = (a_.values[i] >> n); + if (tmp > UINT32_MAX) tmp = UINT32_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp); + } + return simde_vcombine_u32(r, simde_vqmovn_u64(simde_uint64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) + #define simde_vqshrun_high_n_s64(r, a, n) vqshrun_high_n_s64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(SIMDE_BUG_CLANG_71365)) + #undef vqshrun_high_n_s64 + #define vqshrun_high_n_s64(r, a, n) simde_vqshrun_high_n_s64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QSHRUN_HIGH_N_H) */ diff --git a/arm/neon/qshrun_n.h b/arm/neon/qshrun_n.h index 4e1aa7395..77f8e6af6 100644 --- 
a/arm/neon/qshrun_n.h +++ b/arm/neon/qshrun_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHRUN_N_H) @@ -35,6 +36,16 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrunh_n_s16(a, n) HEDLEY_STATIC_CAST(uint8_t, vqshrunh_n_s16((a), (n))) +#else + #define simde_vqshrunh_n_s16(a, n) simde_vqmovunh_s16(simde_x_vshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrunh_n_s16 + #define vqshrunh_n_s16(a, n) simde_vqshrunh_n_s16(a, n) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqshruns_n_s32(a, n) HEDLEY_STATIC_CAST(uint16_t, vqshruns_n_s32((a), (n))) #else diff --git a/arm/neon/qsub.h b/arm/neon/qsub.h index 0c3e375c1..87213b43e 100644 --- a/arm/neon/qsub.h +++ b/arm/neon/qsub.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QSUB_H) @@ -134,6 +135,8 @@ simde_vqsub_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pi8(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT8_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -168,6 +171,8 @@ simde_vqsub_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pi16(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT16_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -200,7 +205,9 @@ simde_vqsub_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT32_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; const __typeof__(r_.values) saturate = diff_sat ^ diff; @@ -232,7 +239,9 @@ simde_vqsub_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT64_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; const __typeof__(r_.values) saturate = diff_sat ^ diff; @@ -266,6 +275,8 @@ simde_vqsub_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pu8(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); @@ -297,6 +308,8 @@ simde_vqsub_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pu16(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); @@ -326,7 +339,9 @@ simde_vqsub_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); #else @@ -355,7 +370,9 @@ simde_vqsub_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); #else @@ -390,6 +407,8 @@ simde_vqsubq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.v128 = wasm_i8x16_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epi8(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i8m1(a_.sv128 , b_.sv128 , 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT8_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -428,6 +447,8 @@ simde_vqsubq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.v128 = wasm_i16x8_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epi16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i16m1(a_.sv128 , b_.sv128 , 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT16_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -479,6 +500,8 @@ simde_vqsubq_s32(simde_int32x4_t a, simde_int32x4_t b) { #else r_.m128i = _mm_xor_si128(diff, _mm_and_si128(t, _mm_srai_epi32(t, 31))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i32m1(a_.sv128 , b_.sv128 , 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT32_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -511,7 +534,9 @@ simde_vqsubq_s64(simde_int64x2_t a, simde_int64x2_t b) { a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i64m1(a_.sv128 , b_.sv128 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT64_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; const __typeof__(r_.values) saturate = diff_sat ^ diff; @@ -549,6 
+574,8 @@ simde_vqsubq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.v128 = wasm_u8x16_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epu8(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u8m1(a_.sv128 , b_.sv128 , 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values <= a_.values); @@ -584,6 +611,8 @@ simde_vqsubq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.v128 = wasm_u16x8_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epu16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u16m1(a_.sv128 , b_.sv128 , 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values <= a_.values); @@ -629,6 +658,8 @@ simde_vqsubq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { _mm_set1_epi32(~INT32_C(0)) ) ); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u32m1(a_.sv128 , b_.sv128 , 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); @@ -661,7 +692,9 @@ simde_vqsubq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u64m1(a_.sv128 , b_.sv128 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); #else diff --git a/arm/neon/qtbl.h b/arm/neon/qtbl.h index 1b7c3b3cd..066278a93 100644 --- a/arm/neon/qtbl.h +++ b/arm/neon/qtbl.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QTBL_H) @@ -40,6 +42,10 @@ simde_uint8x8_t simde_vqtbl1_u8(simde_uint8x16_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl1_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbl2_u8(split, idx); #else simde_uint8x16_private t_ = simde_uint8x16_to_private(t); simde_uint8x8_private @@ -50,6 +56,10 @@ simde_vqtbl1_u8(simde_uint8x16_t t, simde_uint8x8_t idx) { __m128i idx128 = _mm_set1_epi64(idx_.m64); __m128i r128 = _mm_shuffle_epi8(t_.m128i, _mm_or_si128(idx128, _mm_cmpgt_epi8(idx128, _mm_set1_epi8(15)))); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv64, 16, 8); + r_.sv64 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, 0, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -86,6 +96,10 @@ simde_uint8x8_t simde_vqtbl2_u8(simde_uint8x16x2_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl2_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbl4_u8(split, idx); #else simde_uint8x16_private t_[2] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]) }; simde_uint8x8_private 
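The qtbl.h hunks in this file add ARMv7 vtbl/vtbx and RISC-V Vector fast paths
behind the existing portable entry points. A minimal usage sketch of the
single-table lookup they accelerate follows; the include path and the helper
name are illustrative and not part of the patch.

#include <simde/arm/neon.h>

/* Gather eight bytes from a 16-byte table; indices >= 16 yield 0,
 * matching vqtbl1_u8 semantics. */
static simde_uint8x8_t
lookup8_from_table16(simde_uint8x16_t table, const uint8_t idx[8]) {
  return simde_vqtbl1_u8(table, simde_vld1_u8(idx));
}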
@@ -99,6 +113,14 @@ simde_vqtbl2_u8(simde_uint8x16x2_t t, simde_uint8x8_t idx) { __m128i r128_1 = _mm_shuffle_epi8(t_[1].m128i, idx128); __m128i r128 = _mm_blendv_epi8(r128_0, r128_1, _mm_slli_epi32(idx128, 3)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -135,6 +157,15 @@ simde_uint8x8_t simde_vqtbl3_u8(simde_uint8x16x3_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl3_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo = vtbl4_u8(split_lo, idx); + uint8x8_t hi = vtbl2_u8(split_hi, idx_hi); + return vorr_u8(lo, hi); #else simde_uint8x16_private t_[3] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]) }; @@ -151,6 +182,16 @@ simde_vqtbl3_u8(simde_uint8x16x3_t t, simde_uint8x8_t idx) { __m128i r128_2 = _mm_shuffle_epi8(t_[2].m128i, idx128); __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(idx128, 2)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -187,6 +228,15 @@ simde_uint8x8_t simde_vqtbl4_u8(simde_uint8x16x4_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl4_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo = vtbl4_u8(split_lo, idx); + uint8x8_t hi = vtbl4_u8(split_hi, idx_hi); + return vorr_u8(lo, hi); #else simde_uint8x16_private t_[4] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]), simde_uint8x16_to_private(t.val[3]) }; @@ -206,6 +256,18 @@ simde_vqtbl4_u8(simde_uint8x16x4_t t, simde_uint8x8_t idx) { __m128i r128_23 = _mm_blendv_epi8(r128_2, r128_3, idx128_shl3); __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(idx128, 2)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + 
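      /* RVV: widen each 16-byte table to an LMUL=4 register group, splice the
       * four tables into one contiguous 64-byte group with vslideup, gather
       * with vrgather, then zero every lane whose index is >= 64. */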
vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -244,6 +306,12 @@ simde_uint8x16_t simde_vqtbl1q_u8(simde_uint8x16_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl1q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbl2_u8(split, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_and(vec_perm(t, t, idx), vec_cmplt(idx, vec_splats(HEDLEY_STATIC_CAST(unsigned char, 16)))); #else @@ -256,6 +324,10 @@ simde_vqtbl1q_u8(simde_uint8x16_t t, simde_uint8x16_t idx) { r_.m128i = _mm_shuffle_epi8(t_.m128i, _mm_or_si128(idx_.m128i, _mm_cmpgt_epi8(idx_.m128i, _mm_set1_epi8(15)))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_swizzle(t_.v128, idx_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv128, 16, 16); + r_.sv128 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv128 , 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, 0, mask, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -292,6 +364,12 @@ simde_uint8x16_t simde_vqtbl2q_u8(simde_uint8x16x2_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl2q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbl4_u8(split, vget_low_u8(idx)); + uint8x8_t hi = vtbl4_u8(split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_and(vec_perm(t.val[0], t.val[1], idx), vec_cmplt(idx, vec_splats(HEDLEY_STATIC_CAST(unsigned char, 32)))); @@ -309,6 +387,14 @@ simde_vqtbl2q_u8(simde_uint8x16x2_t t, simde_uint8x16_t idx) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_or(wasm_i8x16_swizzle(t_[0].v128, idx_.v128), wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv128); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 16); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -345,6 +431,17 @@ simde_uint8x16_t simde_vqtbl3q_u8(simde_uint8x16x3_t 
t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl3q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t hi_lo = vtbl2_u8(split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi_hi = vtbl2_u8(split_hi, vget_high_u8(idx_hi)); + uint8x8_t lo = vtbx4_u8(hi_lo, split_lo, vget_low_u8(idx)); + uint8x8_t hi = vtbx4_u8(hi_hi, split_lo, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_2 = vec_perm(t.val[2], t.val[2], idx); @@ -368,6 +465,16 @@ simde_vqtbl3q_u8(simde_uint8x16x3_t t, simde_uint8x16_t idx) { r_.v128 = wasm_v128_or(wasm_v128_or(wasm_i8x16_swizzle(t_[0].v128, idx_.v128), wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -404,6 +511,17 @@ simde_uint8x16_t simde_vqtbl4q_u8(simde_uint8x16x4_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl4q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo_lo = vtbl4_u8(split_lo, vget_low_u8(idx)); + uint8x8_t lo_hi = vtbl4_u8(split_lo, vget_high_u8(idx)); + uint8x8_t lo = vtbx4_u8(lo_lo, split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi = vtbx4_u8(lo_hi, split_hi, vget_high_u8(idx_hi)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_23 = vec_perm(t.val[2], t.val[3], idx); @@ -431,6 +549,18 @@ simde_vqtbl4q_u8(simde_uint8x16x4_t t, simde_uint8x16_t idx) { wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_v128_or(wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32))), wasm_i8x16_swizzle(t_[3].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(48))))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = 
__riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -462,6 +592,130 @@ simde_vqtbl4q_s8(simde_int8x16x4_t t, simde_uint8x16_t idx) { #define vqtbl4q_s8(t, idx) simde_vqtbl4q_s8((t), (idx)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl1_p8(simde_poly8x16_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl1_p8(t, idx); + #else + return simde_vreinterpret_p8_u8(simde_vqtbl1_u8(simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl1_p8 + #define vqtbl1_p8(t, idx) simde_vqtbl1_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl1q_p8(simde_poly8x16_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl1q_p8(t, idx); + #else + return simde_vreinterpretq_p8_u8(simde_vqtbl1q_u8(simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl1q_p8 + #define vqtbl1q_p8(t, idx) simde_vqtbl1q_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl2_p8(simde_poly8x16x2_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl2_p8(t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbl2_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl2_p8 + #define vqtbl2_p8(t, idx) simde_vqtbl2_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl2q_p8(simde_poly8x16x2_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl2q_p8(t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbl2q_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl2q_p8 + #define vqtbl2q_p8(t, idx) simde_vqtbl2q_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl3_p8(simde_poly8x16x3_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl3_p8(t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbl3_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl3_p8 + #define vqtbl3_p8(t, idx) simde_vqtbl3_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl3q_p8(simde_poly8x16x3_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl3q_p8(t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbl3q_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl3q_p8 + #define vqtbl3q_p8(t, idx) simde_vqtbl3q_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl4_p8(simde_poly8x16x4_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl4_p8(t, idx); + 
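    /* Portable path below: the poly8 and u8 table structs share a layout, so
     * the tables are reinterpreted via simde_memcpy and the lookup defers to
     * simde_vqtbl4_u8. */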
#else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbl4_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl4_p8 + #define vqtbl4_p8(t, idx) simde_vqtbl4_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl4q_p8(simde_poly8x16x4_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl4q_p8(t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbl4q_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl4q_p8 + #define vqtbl4q_p8(t, idx) simde_vqtbl4q_p8((t), (idx)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/qtbx.h b/arm/neon/qtbx.h index 5ba998fb1..326bf9f21 100644 --- a/arm/neon/qtbx.h +++ b/arm/neon/qtbx.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QTBX_H) @@ -40,6 +42,10 @@ simde_uint8x8_t simde_vqtbx1_u8(simde_uint8x8_t a, simde_uint8x16_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx1_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbx2_u8(a, split, idx); #else simde_uint8x16_private t_ = simde_uint8x16_to_private(t); simde_uint8x8_private @@ -53,6 +59,10 @@ simde_vqtbx1_u8(simde_uint8x8_t a, simde_uint8x16_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_shuffle_epi8(t_.m128i, idx128); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv64, 16, 8); + r_.sv64 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vvm_u8m1(r_.sv64, a_.sv64, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -89,6 +99,10 @@ simde_uint8x8_t simde_vqtbx2_u8(simde_uint8x8_t a, simde_uint8x16x2_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx2_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbx4_u8(a, split, idx); #else simde_uint8x16_private t_[2] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]) }; simde_uint8x8_private @@ -104,6 +118,15 @@ simde_vqtbx2_u8(simde_uint8x8_t a, simde_uint8x16x2_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_blendv_epi8(r128_0, r128_1, _mm_slli_epi32(idx128, 3)); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2(a_.sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { @@ -140,6 +163,14 @@ simde_uint8x8_t simde_vqtbx3_u8(simde_uint8x8_t a, simde_uint8x16x3_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx3_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t hi = vtbx2_u8(a, split_hi, idx_hi); + return vtbx4_u8(hi, split_lo, idx); #else simde_uint8x16_private t_[3] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]) }; simde_uint8x8_private @@ -157,6 +188,17 @@ simde_vqtbx3_u8(simde_uint8x8_t a, simde_uint8x16x3_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(idx128, 2)); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv64); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -193,6 +235,14 @@ simde_uint8x8_t simde_vqtbx4_u8(simde_uint8x8_t a, simde_uint8x16x4_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx4_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo = vtbx4_u8(a, split_lo, idx); + return vtbx4_u8(lo, split_hi, idx_hi); #else simde_uint8x16_private t_[4] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]), simde_uint8x16_to_private(t.val[3]) }; simde_uint8x8_private @@ -213,6 +263,19 @@ simde_vqtbx4_u8(simde_uint8x8_t a, simde_uint8x16x4_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(idx128, 2)); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv64); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine 
, idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -251,6 +314,12 @@ simde_uint8x16_t simde_vqtbx1q_u8(simde_uint8x16_t a, simde_uint8x16_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx1q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbx2_u8(vget_low_u8(a), split, vget_low_u8(idx)); + uint8x8_t hi = vtbx2_u8(vget_high_u8(a), split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_sel(a, vec_perm(t, t, idx), @@ -268,6 +337,10 @@ simde_vqtbx1q_u8(simde_uint8x16_t a, simde_uint8x16_t t, simde_uint8x16_t idx) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_or(wasm_i8x16_swizzle(t_.v128, idx_.v128), wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(15)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv128, 16, 16); + r_.sv128 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv128 , 16); + r_.sv128 = __riscv_vmerge_vvm_u8m1(r_.sv128, a_.sv128, mask, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -304,6 +377,12 @@ simde_uint8x16_t simde_vqtbx2q_u8(simde_uint8x16_t a, simde_uint8x16x2_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx2q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbx4_u8(vget_low_u8(a), split, vget_low_u8(idx)); + uint8x8_t hi = vtbx4_u8(vget_high_u8(a), split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_sel(a, vec_perm(t.val[0], t.val[1], idx), vec_cmplt(idx, vec_splats(HEDLEY_STATIC_CAST(unsigned char, 32)))); @@ -324,6 +403,15 @@ simde_vqtbx2q_u8(simde_uint8x16_t a, simde_uint8x16x2_t t, simde_uint8x16_t idx) r_.v128 = wasm_v128_or(wasm_v128_or(wasm_i8x16_swizzle(t_[0].v128, idx_.v128), wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(31)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_.sv128); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv128); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 16); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -360,6 +448,17 @@ simde_uint8x16_t simde_vqtbx3q_u8(simde_uint8x16_t a, simde_uint8x16x3_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx3q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t hi_lo = vtbx2_u8(vget_low_u8(a), split_hi, vget_low_u8(idx_hi)); + 
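      /* vtbx2 consults the third table first: biased indices 0..15 pick bytes
       * 32..47 of the logical table while out-of-range lanes keep `a`; the
       * vtbx4 calls below then overwrite any lane whose original index falls
       * in 0..31 (the first two tables). */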
uint8x8_t hi_hi = vtbx2_u8(vget_high_u8(a), split_hi, vget_high_u8(idx_hi)); + uint8x8_t lo_lo = vtbx4_u8(hi_lo, split_lo, vget_low_u8(idx)); + uint8x8_t lo_hi = vtbx4_u8(hi_hi, split_lo, vget_high_u8(idx)); + return vcombine_u8(lo_lo, lo_hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_2 = vec_perm(t.val[2], t.val[2], idx); @@ -386,6 +485,17 @@ simde_vqtbx3q_u8(simde_uint8x16_t a, simde_uint8x16x3_t t, simde_uint8x16_t idx) wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_v128_or(wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32))) , wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(47))))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -422,6 +532,17 @@ simde_uint8x16_t simde_vqtbx4q_u8(simde_uint8x16_t a, simde_uint8x16x4_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx4q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo_lo = vtbx4_u8(vget_low_u8(a), split_lo, vget_low_u8(idx)); + uint8x8_t lo_hi = vtbx4_u8(vget_high_u8(a), split_lo, vget_high_u8(idx)); + uint8x8_t lo = vtbx4_u8(lo_lo, split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi = vtbx4_u8(lo_hi, split_hi, vget_high_u8(idx_hi)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_23 = vec_perm(t.val[2], t.val[3], idx); @@ -452,6 +573,19 @@ simde_vqtbx4q_u8(simde_uint8x16_t a, simde_uint8x16x4_t t, simde_uint8x16_t idx) wasm_v128_or(wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32))), wasm_i8x16_swizzle(t_[3].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(48))))), wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(63)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = 
__riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -483,6 +617,130 @@ simde_vqtbx4q_s8(simde_int8x16_t a, simde_int8x16x4_t t, simde_uint8x16_t idx) { #define vqtbx4q_s8(a, t, idx) simde_vqtbx4q_s8((a), (t), (idx)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx1_p8(simde_poly8x8_t a, simde_poly8x16_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx1_p8(a, t, idx); + #else + return simde_vreinterpret_p8_u8(simde_vqtbx1_u8(simde_vreinterpret_u8_p8(a), simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx1_p8 + #define vqtbx1_p8(a, t, idx) simde_vqtbx1_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx1q_p8(simde_poly8x16_t a, simde_poly8x16_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx1q_p8(a, t, idx); + #else + return simde_vreinterpretq_p8_u8(simde_vqtbx1q_u8(simde_vreinterpretq_u8_p8(a), simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx1q_p8 + #define vqtbx1q_p8(a, t, idx) simde_vqtbx1q_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx2_p8(simde_poly8x8_t a, simde_poly8x16x2_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx2_p8(a, t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbx2_u8(simde_vreinterpret_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx2_p8 + #define vqtbx2_p8(a, t, idx) simde_vqtbx2_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx2q_p8(simde_poly8x16_t a, simde_poly8x16x2_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx2q_p8(a, t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbx2q_u8(simde_vreinterpretq_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx2q_p8 + #define vqtbx2q_p8(a, t, idx) simde_vqtbx2q_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx3_p8(simde_poly8x8_t a, simde_poly8x16x3_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx3_p8(a, t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbx3_u8(simde_vreinterpret_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx3_p8 + #define vqtbx3_p8(a, t, idx) simde_vqtbx3_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx3q_p8(simde_poly8x16_t a, simde_poly8x16x3_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx3q_p8(a, t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbx3q_u8(simde_vreinterpretq_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx3q_p8 + 
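  /* With native aliases enabled the ARM spelling is re-pointed at the
   * portable simde_ wrapper defined above; the same pattern repeats for each
   * new poly8 qtbx entry point in this file. */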
#define vqtbx3q_p8(a, t, idx) simde_vqtbx3q_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx4_p8(simde_poly8x8_t a, simde_poly8x16x4_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx4_p8(a, t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbx4_u8(simde_vreinterpret_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx4_p8 + #define vqtbx4_p8(a, t, idx) simde_vqtbx4_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx4q_p8(simde_poly8x16_t a, simde_poly8x16x4_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx4q_p8(a, t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbx4q_u8(simde_vreinterpretq_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx4q_p8 + #define vqtbx4q_p8(a, t, idx) simde_vqtbx4q_p8((a), (t), (idx)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/raddhn.h b/arm/neon/raddhn.h new file mode 100644 index 000000000..0f16e446e --- /dev/null +++ b/arm/neon/raddhn.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RADDHN_H) +#define SIMDE_ARM_NEON_RADDHN_H + +#include "add.h" +#include "shr_n.h" +#include "movn.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vraddhn_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_s16(a, b); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + int16_t round_cast = 1 << 7; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i] + round_cast; + } + return simde_vmovn_s16(simde_vshrq_n_s16(simde_int16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_s16 + #define vraddhn_s16(a, b) simde_vraddhn_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vraddhn_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_s32(a, b); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + int round_cast = 1 << 15; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i] + round_cast; + } + return simde_vmovn_s32(simde_vshrq_n_s32(simde_int32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_s32 + #define vraddhn_s32(a, b) simde_vraddhn_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vraddhn_s64(simde_int64x2_t a, simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_s64(a, b); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + int64_t round_cast = 1ll << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] + b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_s64(simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_s64 + #define vraddhn_s64(a, b) simde_vraddhn_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vraddhn_u16(simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_u16(a, b); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + uint16_t round_cast = 1 << 7; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, a_.values[i] + b_.values[i] + round_cast); + } + return simde_vmovn_u16(simde_vshrq_n_u16(simde_uint16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_u16 + #define vraddhn_u16(a, b) simde_vraddhn_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vraddhn_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_u32(a, b); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t round_cast = 1 << 15; + SIMDE_VECTORIZE + for (size_t i = 0 
; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i] + b_.values[i] + round_cast); + } + return simde_vmovn_u32(simde_vshrq_n_u32(simde_uint32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_u32 + #define vraddhn_u32(a, b) simde_vraddhn_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vraddhn_u64(simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + uint64_t round_cast = 1ull << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] + b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_u64(simde_uint64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_u64 + #define vraddhn_u64(a, b) simde_vraddhn_u64((a), (b)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RADDHN_H) */ diff --git a/arm/neon/raddhn_high.h b/arm/neon/raddhn_high.h new file mode 100644 index 000000000..dc911698c --- /dev/null +++ b/arm/neon/raddhn_high.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RADDHN_HIGH_H) +#define SIMDE_ARM_NEON_RADDHN_HIGH_H + +#include "raddhn.h" +#include "combine.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_s16(r, a, b) vraddhn_high_s16((r), (a), (b)) +#else + #define simde_vraddhn_high_s16(r, a, b) simde_vcombine_s8(r, simde_vraddhn_s16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_s16 + #define vraddhn_high_s16(r, a, b) simde_vraddhn_high_s16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_s32(r, a, b) vraddhn_high_s32((r), (a), (b)) +#else + #define simde_vraddhn_high_s32(r, a, b) simde_vcombine_s16(r, simde_vraddhn_s32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_s32 + #define vraddhn_high_s32(r, a, b) simde_vraddhn_high_s32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_s64(r, a, b) vraddhn_high_s64((r), (a), (b)) +#else + #define simde_vraddhn_high_s64(r, a, b) simde_vcombine_s32(r, simde_vraddhn_s64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_s64 + #define vraddhn_high_s64(r, a, b) simde_vraddhn_high_s64((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_u16(r, a, b) vraddhn_high_u16((r), (a), (b)) +#else + #define simde_vraddhn_high_u16(r, a, b) simde_vcombine_u8(r, simde_vraddhn_u16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_u16 + #define vraddhn_high_u16(r, a, b) simde_vraddhn_high_u16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_u32(r, a, b) vraddhn_high_u32((r), (a), (b)) +#else + #define simde_vraddhn_high_u32(r, a, b) simde_vcombine_u16(r, simde_vraddhn_u32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_u32 + #define vraddhn_high_u32(r, a, b) simde_vraddhn_high_u32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_u64(r, a, b) vraddhn_high_u64((r), (a), (b)) +#else + #define simde_vraddhn_high_u64(r, a, b) simde_vcombine_u32(r, simde_vraddhn_u64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_u64 + #define vraddhn_high_u64(r, a, b) simde_vraddhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RADDHN_HIGH_H) */ diff --git a/arm/neon/rax.h b/arm/neon/rax.h new file mode 100644 index 000000000..a83576a48 --- /dev/null +++ b/arm/neon/rax.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
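The new *_high wrappers in raddhn_high.h need no arithmetic of their own on the fallback path: they narrow the fresh input pair with vraddhn and splice the result onto the caller-supplied low half with vcombine, exactly as the macros above spell out. A compile-only sketch in SIMDe terms (it assumes the SIMDe headers are available as <simde/arm/neon.h> and adds nothing beyond what the macros already do):

    #include <simde/arm/neon.h>

    /* What simde_vraddhn_high_s16 expands to on the portable path:
     * keep r as the low 8 lanes, put vraddhn(a, b) in the high 8 lanes. */
    simde_int8x16_t
    raddhn_high_s16_sketch(simde_int8x8_t r, simde_int16x8_t a, simde_int16x8_t b) {
      return simde_vcombine_s8(r, simde_vraddhn_s16(a, b));
    }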
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RAX_H) +#define SIMDE_ARM_NEON_RAX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vrax1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return vrax1q_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = (b_.values[i] >> 63) | (b_.values[i] << 1); + r_.values[i] = a_.values[i] ^ b_.values[i]; + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) + #undef vrax1q_u64 + #define vrax1q_u64(a, b) simde_vrax1q_u64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RAX_H) */ diff --git a/arm/neon/rbit.h b/arm/neon/rbit.h index c507df720..ce63117c1 100644 --- a/arm/neon/rbit.h +++ b/arm/neon/rbit.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* The GFNI implementation is based on Wojciech Muła's work at @@ -61,6 +63,13 @@ simde_vrbit_u8(simde_uint8x8_t a) { a_.m64 = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a_.m64, 2)), _mm_and_si64(mask, _mm_srli_pi16(a_.m64, 2))); mask = _mm_set1_pi8(0x0F); r_.m64 = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a_.m64, 4)), _mm_and_si64(mask, _mm_srli_pi16(a_.m64, 4))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t mask; + mask = __riscv_vmv_v_x_u8m1(0x55 , 8); + a_.sv64 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv64 , 1 , 8) , 8) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv64 , 8) , 1 , 8) , 8); + mask = __riscv_vmv_v_x_u8m1(0x33 , 8); + a_.sv64 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv64 , 2 , 8) , 8) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv64 , 8) , 2 , 8) , 8); + r_.sv64 = __riscv_vor_vv_u8m1(__riscv_vsrl_vx_u8m1(a_.sv64 , 4 , 8) , __riscv_vsll_vx_u8m1(a_.sv64 , 4 , 8) , 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -126,6 +135,13 @@ simde_vrbitq_u8(simde_uint8x16_t a) { a_.v128 = wasm_v128_bitselect(wasm_u8x16_shr(a_.v128, 1), wasm_i8x16_shl(a_.v128, 1), wasm_i8x16_splat(0x55)); a_.v128 = wasm_v128_bitselect(wasm_u8x16_shr(a_.v128, 2), wasm_i8x16_shl(a_.v128, 2), wasm_i8x16_splat(0x33)); r_.v128 = wasm_v128_or(wasm_u8x16_shr(a_.v128, 4), wasm_i8x16_shl(a_.v128, 4)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t mask; + mask = __riscv_vmv_v_x_u8m1(0x55 , 16); + a_.sv128 = 
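vrax1q_u64 above is the SHA-3 RAX1 helper: rotate each 64-bit lane of b left by one and XOR it into the matching lane of a, which is what the (b >> 63) | (b << 1) expression computes. A one-lane scalar sketch with a hypothetical helper name:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One lane of RAX1: a ^ ROL64(b, 1). */
    static uint64_t rax1_lane(uint64_t a, uint64_t b) {
      return a ^ ((b >> 63) | (b << 1));
    }

    int main(void) {
      /* 0x8000000000000001 rotated left by 1 is 0x0000000000000003 */
      printf("%" PRIx64 "\n", rax1_lane(0, UINT64_C(0x8000000000000001))); /* prints 3 */
      return 0;
    }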
__riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv128 , 1 , 16) , 16) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv128 , 16) , 1 , 16) , 16); + mask = __riscv_vmv_v_x_u8m1(0x33 , 16); + a_.sv128 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv128 , 2 , 16) , 16) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv128 , 16) , 2 , 16) , 16); + r_.sv128 = __riscv_vor_vv_u8m1(__riscv_vsrl_vx_u8m1(a_.sv128 , 4 , 16) , __riscv_vsll_vx_u8m1(a_.sv128 , 4 , 16) , 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -159,6 +175,34 @@ simde_vrbitq_s8(simde_int8x16_t a) { #define vrbitq_s8(a) simde_vrbitq_s8(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrbit_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrbit_p8(a); + #else + return simde_vreinterpret_p8_u8(simde_vrbit_u8(simde_vreinterpret_u8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrbit_p8 + #define vrbit_p8(a) simde_vrbit_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrbitq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrbitq_p8(a); + #else + return simde_vreinterpretq_p8_u8(simde_vrbitq_u8(simde_vreinterpretq_u8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrbitq_p8 + #define vrbitq_p8(a) simde_vrbitq_p8(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/recpe.h b/arm/neon/recpe.h index ed9ef4254..b7a8b11f8 100644 --- a/arm/neon/recpe.h +++ b/arm/neon/recpe.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_RECPE_H) @@ -34,6 +36,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrecpeh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpeh_f16(a); + #else + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + r_ = 1.0f / a_; + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecpeh_f16 + #define vrecpeh_f16(a) simde_vrecpeh_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrecpes_f32(simde_float32_t a) { @@ -62,6 +81,33 @@ simde_vrecped_f64(simde_float64_t a) { #define vrecped_f64(a) simde_vrecped_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrecpe_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpe_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfrec7_v_f16m1(a_.sv64 , 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpeh_f16(a_.values[i]); + } + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecpe_f16 + #define vrecpe_f16(a) simde_vrecpe_f16((a)) 
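The RISC-V vrbit paths above (like the MMX and WASM ones before them) reverse each byte with the usual mask-and-shift ladder: swap adjacent bits under 0x55, swap bit pairs under 0x33, then swap the two nibbles. A scalar sketch of that ladder on a single byte, with a hypothetical helper name:

    #include <stdint.h>
    #include <stdio.h>

    /* Bit-reverse one byte: swap bits, then pairs, then nibbles. */
    static uint8_t rbit8(uint8_t x) {
      x = (uint8_t)(((x >> 1) & 0x55) | ((x & 0x55) << 1)); /* swap adjacent bits */
      x = (uint8_t)(((x >> 2) & 0x33) | ((x & 0x33) << 2)); /* swap bit pairs     */
      x = (uint8_t)((x >> 4) | (x << 4));                   /* swap nibbles       */
      return x;
    }

    int main(void) {
      printf("0x%02x\n", rbit8(0x01)); /* prints 0x80 */
      return 0;
    }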
+#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrecpe_f32(simde_float32x2_t a) { @@ -72,7 +118,9 @@ simde_vrecpe_f32(simde_float32x2_t a) { r_, a_ = simde_float32x2_to_private(a); - #if defined(SIMDE_IEEE754_STORAGE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfrec7_v_f32m1(a_.sv64 , 2); + #elif defined(SIMDE_IEEE754_STORAGE) /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */ SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -111,7 +159,9 @@ simde_vrecpe_f64(simde_float64x1_t a) { r_, a_ = simde_float64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfrec7_v_f64m1(a_.sv64 , 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = 1.0 / a_.values; #else SIMDE_VECTORIZE @@ -138,7 +188,9 @@ simde_vrecpeq_f64(simde_float64x2_t a) { r_, a_ = simde_float64x2_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfrec7_v_f64m1(a_.sv128 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = 1.0 / a_.values; #else SIMDE_VECTORIZE @@ -167,8 +219,11 @@ simde_vrecpeq_f32(simde_float32x4_t a) { r_, a_ = simde_float32x4_to_private(a); + #if defined(SIMDE_X86_SSE_NATIVE) r_.m128 = _mm_rcp_ps(a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfrec7_v_f32m1(a_.sv128 , 4); #elif defined(SIMDE_IEEE754_STORAGE) /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */ SIMDE_VECTORIZE @@ -198,6 +253,33 @@ simde_vrecpeq_f32(simde_float32x4_t a) { #define vrecpeq_f32(a) simde_vrecpeq_f32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrecpeq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpeq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfrec7_v_f16m1(a_.sv128 , 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpeh_f16(a_.values[i]); + } + #endif + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecpeq_f16 + #define vrecpeq_f16(a) simde_vrecpeq_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vrecpe_u32(simde_uint32x2_t a){ @@ -210,7 +292,7 @@ simde_vrecpe_u32(simde_uint32x2_t a){ SIMDE_VECTORIZE for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - if(a_.values[i] <= 0x7FFFFFFF){ + if (a_.values[i] <= 0x7FFFFFFF){ r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; @@ -241,7 +323,7 @@ simde_vrecpeq_u32(simde_uint32x4_t a){ SIMDE_VECTORIZE for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - if(a_.values[i] <= 0x7FFFFFFF){ + if (a_.values[i] <= 0x7FFFFFFF){ r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; diff --git a/arm/neon/recps.h b/arm/neon/recps.h index 85c4f1052..d0f06fcd4 100644 --- a/arm/neon/recps.h +++ b/arm/neon/recps.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes 
Technology) */ #if !defined(SIMDE_ARM_NEON_RECPS_H) @@ -35,6 +36,21 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrecpsh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpsh_f16(a, b); + #else + return simde_float16_from_float32(SIMDE_FLOAT32_C(2.0) - + simde_float16_to_float32(a) * simde_float16_to_float32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecpsh_f16 + #define vrecpsh_f16(a, b) simde_vrecpsh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrecpss_f32(simde_float32_t a, simde_float32_t b) { @@ -77,6 +93,30 @@ simde_vrecps_f64(simde_float64x1_t a, simde_float64x1_t b) { #define vrecps_f64(a, b) simde_vrecps_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrecps_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecps_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpsh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecps_f16 + #define vrecps_f16(a, b) simde_vrecps_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrecps_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -119,6 +159,30 @@ simde_vrecpsq_f32(simde_float32x4_t a, simde_float32x4_t b) { #define vrecpsq_f32(a, b) simde_vrecpsq_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrecpsq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpsq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpsh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecpsq_f16 + #define vrecpsq_f16(a, b) simde_vrecpsq_f16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP #endif /* !defined(SIMDE_ARM_NEON_RECPS_H) */ diff --git a/arm/neon/recpx.h b/arm/neon/recpx.h new file mode 100644 index 000000000..fede73dd9 --- /dev/null +++ b/arm/neon/recpx.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * 
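vrecpe* only delivers an estimate (the new f16 scalar path simply divides, and the RISC-V path maps to the 7-bit vfrec7 estimate), while vrecps* returns 2 - a*b, the Newton-Raphson correction factor. The two are meant to be chained: each x = x * vrecps(a, x) step roughly doubles the number of correct bits. A plain-float sketch of that refinement loop, with hypothetical names standing in for the intrinsics:

    #include <stdio.h>

    /* One Newton-Raphson step toward 1/a: x1 = x0 * (2 - a*x0).
     * The (2 - a*x0) factor is exactly what vrecps computes. */
    static float recip_step(float a, float x0) {
      return x0 * (2.0f - a * x0);
    }

    int main(void) {
      float a = 3.0f;
      float x = 0.3f;                 /* stand-in for a vrecpe estimate */
      for (int i = 0; i < 3; i++) x = recip_step(a, x);
      printf("%f\n", x);              /* converges toward 0.333333 */
      return 0;
    }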
included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RECPX_H) +#define SIMDE_ARM_NEON_RECPX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrecpxh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpxh_f16(a); + #else + if (simde_isnanhf(a)) { + return SIMDE_NANHF; + } + uint16_t n; + simde_memcpy(&n, &a, sizeof(a)); + uint16_t sign = n & 0x8000; + uint16_t exp = n & 0x7c00; + uint16_t result; + if (exp == 0) { + uint16_t max_exp = 0x7b00; + result = sign|max_exp; + } + else { + exp = ~(exp) & 0x7c00; + result = sign|exp; + } + simde_memcpy(&a, &result, sizeof(result)); + return a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrecpxh_f16 + #define vrecpxh_f16(a) simde_vrecpxh_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vrecpxs_f32(simde_float32_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrecpxs_f32(a); + #else + if (simde_math_isnanf(a)) { + return SIMDE_MATH_NANF; + } + uint32_t n; + simde_memcpy(&n, &a, sizeof(a)); + uint32_t sign = n & 0x80000000; + uint32_t exp = n & 0x7f800000; + uint32_t result; + if (exp == 0) { + uint32_t max_exp = 0x7f000000; + result = sign|max_exp; + } + else { + exp = ~(exp) & 0x7f800000; + result = sign|exp; + } + simde_memcpy(&a, &result, sizeof(result)); + return a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrecpxs_f32 + #define vrecpxs_f32(a) simde_vrecpxs_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vrecpxd_f64(simde_float64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrecpxd_f64(a); + #else + if (simde_math_isnan(a)) { + return SIMDE_MATH_NAN; + } + uint64_t n; + simde_memcpy(&n, &a, sizeof(a)); + uint64_t sign = n & 0x8000000000000000ull; + uint64_t exp = n & 0x7ff0000000000000ull; + uint64_t result; + if (exp == 0) { + uint64_t max_exp = 0x7fe0000000000000ull; + result = sign|max_exp; + } + else { + exp = ~(exp) & 0x7ff0000000000000ull; + result = sign|exp; + } + simde_memcpy(&a, &result, sizeof(result)); + return a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrecpxd_f64 + #define vrecpxd_f64(a) simde_vrecpxd_f64((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP +#endif /* !defined(SIMDE_ARM_NEON_RECPX_H) */ diff --git a/arm/neon/reinterpret.h b/arm/neon/reinterpret.h index 88bddbe6d..b91c3271a 100644 --- a/arm/neon/reinterpret.h +++ b/arm/neon/reinterpret.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ @@ -1696,7 
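The vrecpx* fallbacks above implement FRECPX by bit manipulation: keep the sign, clear the fraction, and replace the exponent field with its bitwise complement, substituting the maximum finite exponent when the input exponent is zero. For a power of two 2^e this yields 2^(1-e), a cheap power-of-two scale on the order of the reciprocal. A float32 sketch of the same steps with a hypothetical helper name (finite, non-NaN input assumed):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* FRECPX-style exponent complement for a finite float. */
    static float recpx_f32(float a) {
      uint32_t n, out;
      memcpy(&n, &a, sizeof n);
      uint32_t sign = n & UINT32_C(0x80000000);
      uint32_t exp  = n & UINT32_C(0x7f800000);
      if (exp == 0)
        out = sign | UINT32_C(0x7f000000);          /* zero/denormal: use max finite exponent */
      else
        out = sign | (~exp & UINT32_C(0x7f800000)); /* complement the exponent, fraction = 0 */
      memcpy(&a, &out, sizeof a);
      return a;
    }

    int main(void) {
      printf("%g\n", recpx_f32(4.0f)); /* prints 0.5: the exponent of 2^2 complements to 2^-1 */
      return 0;
    }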
+1697,8 @@ simde_vreinterpret_u16_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_u16_f16 #define vreinterpret_u16_f16(a) simde_vreinterpret_u16_f16(a) #endif @@ -2172,7 +2174,8 @@ simde_vreinterpretq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_u16_f16 #define vreinterpretq_u16_f16(a) simde_vreinterpretq_u16_f16(a) #endif @@ -2330,6 +2333,24 @@ simde_vreinterpret_u64_u32(simde_uint32x2_t a) { #define vreinterpret_u64_u32 simde_vreinterpret_u64_u32 #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_u64_f16(a); + #else + simde_uint64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_u64_f16 + #define vreinterpret_u64_f16 simde_vreinterpret_u64_f16 +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint64x1_t simde_vreinterpret_u64_f32(simde_float32x2_t a) { @@ -2631,7 +2652,8 @@ simde_vreinterpret_f16_u16(simde_uint16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_u16 #define vreinterpret_f16_u16(a) simde_vreinterpret_f16_u16(a) #endif @@ -2670,6 +2692,7 @@ simde_vreinterpret_f32_u64(simde_uint64x1_t a) { #define vreinterpret_f32_u64 simde_vreinterpret_f32_u64 #endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vreinterpret_f32_f64(simde_float64x1_t a) { @@ -2801,7 +2824,8 @@ simde_vreinterpretq_f16_u16(simde_uint16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_u16 #define vreinterpretq_f16_u16(a) simde_vreinterpretq_f16_u16(a) #endif @@ -3163,6 +3187,4464 @@ simde_vreinterpretq_f64_f32(simde_float32x4_t a) { #define vreinterpretq_f64_f32(a) simde_vreinterpretq_f64_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_f32(a); + #else + simde_float16x4_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_f32 + #define vreinterpret_f16_f32 simde_vreinterpret_f16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_float16x4_t +simde_vreinterpret_f16_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s16(a); + #else + simde_float16x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_s16 + #define vreinterpret_f16_s16 simde_vreinterpret_f16_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s32(a); + #else + simde_float16x4_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_s32 + #define vreinterpret_f16_s32 simde_vreinterpret_f16_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s64(a); + #else + simde_float16x4_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_s64 + #define vreinterpret_f16_s64 simde_vreinterpret_f16_s64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s8(a); + #else + simde_float16x4_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_s8 + #define vreinterpret_f16_s8 simde_vreinterpret_f16_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_u32(a); + #else + simde_float16x4_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_u32 + #define vreinterpret_f16_u32 simde_vreinterpret_f16_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_u64(a); + #else + simde_float16x4_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_u64 + #define vreinterpret_f16_u64 simde_vreinterpret_f16_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_u8(a); + #else + simde_float16x4_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_u8 + #define vreinterpret_f16_u8 simde_vreinterpret_f16_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_f32(a); + #else + simde_float16x8_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_f32 + #define vreinterpretq_f16_f32(a) simde_vreinterpretq_f16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s16(a); + #else + simde_float16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_s16 + #define vreinterpretq_f16_s16(a) simde_vreinterpretq_f16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s32(a); + #else + simde_float16x8_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_s32 + #define vreinterpretq_f16_s32(a) simde_vreinterpretq_f16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s64(a); + #else + simde_float16x8_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_s64 + #define vreinterpretq_f16_s64(a) simde_vreinterpretq_f16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s8(simde_int8x16_t a) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s8(a); + #else + simde_float16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_s8 + #define vreinterpretq_f16_s8(a) simde_vreinterpretq_f16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_u32(a); + #else + simde_float16x8_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_u32 + #define vreinterpretq_f16_u32(a) simde_vreinterpretq_f16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_u64(a); + #else + simde_float16x8_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_u64 + #define vreinterpretq_f16_u64(a) simde_vreinterpretq_f16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_u8(a); + #else + simde_float16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_u8 + #define vreinterpretq_f16_u8(a) simde_vreinterpretq_f16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_f64(a); + #else + simde_float16x4_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_f64 + #define vreinterpret_f16_f64 simde_vreinterpret_f16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_f64(a); + #else + simde_float16x8_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_f64 + #define vreinterpretq_f16_f64(a) simde_vreinterpretq_f16_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f32_f16(a); + #else + simde_float32x2_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f32_f16 + #define vreinterpret_f32_f16 simde_vreinterpret_f32_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f32_f16(a); + #else + simde_float32x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f32_f16 + #define vreinterpretq_f32_f16 simde_vreinterpretq_f32_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f64_f16(a); + #else + simde_float64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f64_f16 + #define vreinterpret_f64_f16 simde_vreinterpret_f64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f64_f16(a); + #else + simde_float64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f64_f16 + #define vreinterpretq_f64_f16 simde_vreinterpretq_f64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_u8_f16(a); + #else + simde_uint8x8_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_u8_f16 + #define vreinterpret_u8_f16(a) simde_vreinterpret_u8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return 
vreinterpretq_u8_f16(a); + #else + simde_uint8x16_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_u8_f16 + #define vreinterpretq_u8_f16(a) simde_vreinterpretq_u8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s8_f16(a); + #else + simde_int8x8_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_s8_f16 + #define vreinterpret_s8_f16(a) simde_vreinterpret_s8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s8_f16(a); + #else + simde_int8x16_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_s8_f16 + #define vreinterpretq_s8_f16(a) simde_vreinterpretq_s8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s16_f16(a); + #else + simde_int16x4_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_s16_f16 + #define vreinterpret_s16_f16(a) simde_vreinterpret_s16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s16_f16(a); + #else + simde_int16x8_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_s16_f16 + #define vreinterpretq_s16_f16(a) simde_vreinterpretq_s16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s32_f16(a); + #else + simde_int32x2_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef 
vreinterpret_s32_f16 + #define vreinterpret_s32_f16(a) simde_vreinterpret_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s32_f16(a); + #else + simde_int32x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_s32_f16 + #define vreinterpretq_s32_f16(a) simde_vreinterpretq_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s64_f16(a); + #else + simde_int64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_s64_f16 + #define vreinterpret_s64_f16(a) simde_vreinterpret_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s64_f16(a); + #else + simde_int64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_s64_f16 + #define vreinterpretq_s64_f16(a) simde_vreinterpretq_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_u32_f16(a); + #else + simde_uint32x2_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_u32_f16 + #define vreinterpret_u32_f16(a) simde_vreinterpret_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_u32_f16(a); + #else + simde_uint32x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_u32_f16 + #define vreinterpretq_u32_f16(a) simde_vreinterpretq_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_u64_f16(a); + #else + simde_uint64x2_private r_; + 
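Every new vreinterpret* fallback in this hunk follows the same idiom: unpack both operands to their _private structs and simde_memcpy the raw bytes across, which is the strict-aliasing-safe way to reinterpret bits in C. A standalone sketch of the idiom with standard types only (hypothetical names, not part of the SIMDe API):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Reinterpret a float's bits as uint32_t without violating strict aliasing:
     * memcpy between same-sized objects; compilers lower this to a register move. */
    static uint32_t bits_of_float(float f) {
      uint32_t u;
      memcpy(&u, &f, sizeof u);
      return u;
    }

    int main(void) {
      printf("0x%08" PRIx32 "\n", bits_of_float(1.0f)); /* prints 0x3f800000 */
      return 0;
    }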
simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_u64_f16 + #define vreinterpretq_u64_f16 simde_vreinterpretq_u64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s8(a); + #else + simde_poly8x8_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s8 + #define vreinterpret_p8_s8 simde_vreinterpret_p8_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s16(a); + #else + simde_poly8x8_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s16 + #define vreinterpret_p8_s16 simde_vreinterpret_p8_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s32(a); + #else + simde_poly8x8_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s32 + #define vreinterpret_p8_s32 simde_vreinterpret_p8_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s64(a); + #else + simde_poly8x8_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s64 + #define vreinterpret_p8_s64 simde_vreinterpret_p8_s64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_p16(a); + #else + simde_poly8x8_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_p16 + #define vreinterpret_p8_p16 simde_vreinterpret_p8_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p8_p64(a); + #else + simde_poly8x8_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_p64 + #define vreinterpret_p8_p64 simde_vreinterpret_p8_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_f32(simde_float32x2_t a) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_f32(a); + #else + simde_poly8x8_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_f32 + #define vreinterpret_p8_f32 simde_vreinterpret_p8_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_p8_f64(a); + #else + simde_poly8x8_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_f64 + #define vreinterpret_p8_f64 simde_vreinterpret_p8_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s8(a); + #else + simde_poly8x16_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s8 + #define vreinterpretq_p8_s8(a) simde_vreinterpretq_p8_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s16(a); + #else + simde_poly8x16_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s16 + #define vreinterpretq_p8_s16(a) simde_vreinterpretq_p8_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s32(a); + #else + simde_poly8x16_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s32 + #define vreinterpretq_p8_s32(a) simde_vreinterpretq_p8_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s64(a); + #else + simde_poly8x16_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s64 + #define vreinterpretq_p8_s64(a) simde_vreinterpretq_p8_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_p16(a); + #else + simde_poly8x16_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_p16 + #define vreinterpretq_p8_p16(a) simde_vreinterpretq_p8_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_poly8x16_t +simde_vreinterpretq_p8_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p8_p64(a); + #else + simde_poly8x16_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_p64 + #define vreinterpretq_p8_p64(a) simde_vreinterpretq_p8_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_f32(a); + #else + simde_poly8x16_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_f32 + #define vreinterpretq_p8_f32(a) simde_vreinterpretq_p8_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_p8_f64(a); + #else + simde_poly8x16_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_f64 + #define vreinterpretq_p8_f64(a) simde_vreinterpretq_p8_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s8(a); + #else + simde_poly16x4_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s8 + #define vreinterpret_p16_s8 simde_vreinterpret_p16_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s16(a); + #else + simde_poly16x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s16 + #define vreinterpret_p16_s16 simde_vreinterpret_p16_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s32(a); + #else + simde_poly16x4_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s32 + #define vreinterpret_p16_s32 simde_vreinterpret_p16_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s64(a); + #else + simde_poly16x4_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s64 + #define vreinterpret_p16_s64 
simde_vreinterpret_p16_s64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_p8(a); + #else + simde_poly16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_p8 + #define vreinterpret_p16_p8 simde_vreinterpret_p16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p16_p64(a); + #else + simde_poly16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_p64 + #define vreinterpret_p16_p64 simde_vreinterpret_p16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_p16_f16(a); + #else + simde_poly16x4_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_p16_f16 + #define vreinterpret_p16_f16(a) simde_vreinterpret_p16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_f32(a); + #else + simde_poly16x4_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_f32 + #define vreinterpret_p16_f32 simde_vreinterpret_p16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_p16_f64(a); + #else + simde_poly16x4_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_f64 + #define vreinterpret_p16_f64 simde_vreinterpret_p16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s8(a); + #else + simde_poly16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s8 + #define vreinterpretq_p16_s8(a) simde_vreinterpretq_p16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s16(a); + #else + simde_poly16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, 
sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s16 + #define vreinterpretq_p16_s16(a) simde_vreinterpretq_p16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s32(a); + #else + simde_poly16x8_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s32 + #define vreinterpretq_p16_s32(a) simde_vreinterpretq_p16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s64(a); + #else + simde_poly16x8_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s64 + #define vreinterpretq_p16_s64(a) simde_vreinterpretq_p16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_p8(a); + #else + simde_poly16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_p8 + #define vreinterpretq_p16_p8(a) simde_vreinterpretq_p16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p16_p64(a); + #else + simde_poly16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_p64 + #define vreinterpretq_p16_p64(a) simde_vreinterpretq_p16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_f32(a); + #else + simde_poly16x8_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_f32 + #define vreinterpretq_p16_f32(a) simde_vreinterpretq_p16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_p16_f64(a); + #else + simde_poly16x8_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_f64 + #define vreinterpretq_p16_f64(a) simde_vreinterpretq_p16_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && 
defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_p16_f16(a); + #else + simde_poly16x8_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_p16_f16 + #define vreinterpretq_p16_f16(a) simde_vreinterpretq_p16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_s8(a); + #else + simde_poly64x1_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_s8 + #define vreinterpret_p64_s8 simde_vreinterpret_p64_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_s16(a); + #else + simde_poly64x1_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_s16 + #define vreinterpret_p64_s16 simde_vreinterpret_p64_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_s32(a); + #else + simde_poly64x1_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_s32 + #define vreinterpret_p64_s32 simde_vreinterpret_p64_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_p8(a); + #else + simde_poly64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_p8 + #define vreinterpret_p64_p8 simde_vreinterpret_p64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_p16(a); + #else + simde_poly64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_p16 + #define vreinterpret_p64_p16 simde_vreinterpret_p64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_p64_f16(a); + #else + simde_poly64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ 
+ !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_p64_f16 + #define vreinterpret_p64_f16 simde_vreinterpret_p64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_f32(a); + #else + simde_poly64x1_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_f32 + #define vreinterpret_p64_f32 simde_vreinterpret_p64_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_p64_f64(a); + #else + simde_poly64x1_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_f64 + #define vreinterpret_p64_f64 simde_vreinterpret_p64_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s8(a); + #else + simde_poly64x2_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s8 + #define vreinterpretq_p64_s8(a) simde_vreinterpretq_p64_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s16(a); + #else + simde_poly64x2_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s16 + #define vreinterpretq_p64_s16(a) simde_vreinterpretq_p64_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s32(a); + #else + simde_poly64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s32 + #define vreinterpretq_p64_s32(a) simde_vreinterpretq_p64_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s64(a); + #else + simde_poly64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s64 + #define vreinterpretq_p64_s64(a) simde_vreinterpretq_p64_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_p8(a); + #else + simde_poly64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, 
&a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_p8 + #define vreinterpretq_p64_p8(a) simde_vreinterpretq_p64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_p16(a); + #else + simde_poly64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_p16 + #define vreinterpretq_p64_p16(a) simde_vreinterpretq_p64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_f32(a); + #else + simde_poly64x2_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_f32 + #define vreinterpretq_p64_f32(a) simde_vreinterpretq_p64_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_p64_f64(a); + #else + simde_poly64x2_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_f64 + #define vreinterpretq_p64_f64(a) simde_vreinterpretq_p64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_p8_f16(a); + #else + simde_poly8x8_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_p8_f16 + #define vreinterpret_p8_f16(a) simde_vreinterpret_p8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_p8_f16(a); + #else + simde_poly8x16_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_p8_f16 + #define vreinterpretq_p8_f16(a) simde_vreinterpretq_p8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_p64_f16(a); + #else + simde_poly64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_p64_f16 + #define vreinterpretq_p64_f16 simde_vreinterpretq_p64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s8_p8(a); + #else + simde_int8x8_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_p8 + #define vreinterpret_s8_p8 simde_vreinterpret_s8_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s8_p16(a); + #else + simde_int8x8_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_p16 + #define vreinterpret_s8_p16 simde_vreinterpret_s8_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s8_p64(a); + #else + simde_int8x8_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_p64 + #define vreinterpret_s8_p64 simde_vreinterpret_s8_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s8_p8(a); + #else + simde_int8x16_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p8 + #define vreinterpretq_s8_p8(a) simde_vreinterpretq_s8_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s8_p16(a); + #else + simde_int8x16_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p16 + #define vreinterpretq_s8_p16(a) simde_vreinterpretq_s8_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s8_p64(a); + #else + simde_int8x16_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p64 + #define vreinterpretq_s8_p64(a) simde_vreinterpretq_s8_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s16_p8(a); + #else + simde_int16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return 
simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_p8 + #define vreinterpret_s16_p8 simde_vreinterpret_s16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s16_p16(a); + #else + simde_int16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_p16 + #define vreinterpret_s16_p16 simde_vreinterpret_s16_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s16_p64(a); + #else + simde_int16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_p64 + #define vreinterpret_s16_p64 simde_vreinterpret_s16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s16_p8(a); + #else + simde_int16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p8 + #define vreinterpretq_s16_p8(a) simde_vreinterpretq_s16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s16_p16(a); + #else + simde_int16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p16 + #define vreinterpretq_s16_p16(a) simde_vreinterpretq_s16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s16_p64(a); + #else + simde_int16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p64 + #define vreinterpretq_s16_p64(a) simde_vreinterpretq_s16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s32_p8(a); + #else + simde_int32x2_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_p8 + #define vreinterpret_s32_p8 simde_vreinterpret_s32_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s32_p16(a); + #else + simde_int32x2_private r_; + simde_poly16x4_private a_ = 
simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_p16 + #define vreinterpret_s32_p16 simde_vreinterpret_s32_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s32_p64(a); + #else + simde_int32x2_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_p64 + #define vreinterpret_s32_p64 simde_vreinterpret_s32_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s32_p8(a); + #else + simde_int32x4_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p8 + #define vreinterpretq_s32_p8(a) simde_vreinterpretq_s32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s32_p16(a); + #else + simde_int32x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p16 + #define vreinterpretq_s32_p16(a) simde_vreinterpretq_s32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s32_p64(a); + #else + simde_int32x4_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p64 + #define vreinterpretq_s32_p64(a) simde_vreinterpretq_s32_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s64_p8(a); + #else + simde_int64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_p8 + #define vreinterpret_s64_p8 simde_vreinterpret_s64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s64_p16(a); + #else + simde_int64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_p16 + #define vreinterpret_s64_p16 simde_vreinterpret_s64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s64_p64(a); + #else + 
simde_int64x1_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_p64 + #define vreinterpret_s64_p64 simde_vreinterpret_s64_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s64_p8(a); + #else + simde_int64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p8 + #define vreinterpretq_s64_p8(a) simde_vreinterpretq_s64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s64_p16(a); + #else + simde_int64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p16 + #define vreinterpretq_s64_p16(a) simde_vreinterpretq_s64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s64_p64(a); + #else + simde_int64x2_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p64 + #define vreinterpretq_s64_p64(a) simde_vreinterpretq_s64_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_f32_p8(a); + #else + simde_float32x2_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f32_p8 + #define vreinterpret_f32_p8 simde_vreinterpret_f32_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_f32_p16(a); + #else + simde_float32x2_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f32_p16 + #define vreinterpret_f32_p16 simde_vreinterpret_f32_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_p16(a); + #else + simde_float16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_p16 + #define vreinterpret_f16_p16(a) 
simde_vreinterpret_f16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_f32_p8(a); + #else + simde_float32x4_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f32_p8 + #define vreinterpretq_f32_p8(a) simde_vreinterpretq_f32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_f32_p16(a); + #else + simde_float32x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f32_p16 + #define vreinterpretq_f32_p16(a) simde_vreinterpretq_f32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_p16(a); + #else + simde_float16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_p16 + #define vreinterpretq_f16_p16(a) simde_vreinterpretq_f16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_f64_p8(a); + #else + simde_float64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_p8 + #define vreinterpret_f64_p8 simde_vreinterpret_f64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_f64_p16(a); + #else + simde_float64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_p16 + #define vreinterpret_f64_p16 simde_vreinterpret_f64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_f64_p64(a); + #else + simde_float64x1_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_p64 + #define vreinterpret_f64_p64 simde_vreinterpret_f64_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_f64_p8(a); + #else + simde_float64x2_private r_; + simde_poly8x16_private a_ = 
simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p8 + #define vreinterpretq_f64_p8(a) simde_vreinterpretq_f64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_f64_p16(a); + #else + simde_float64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p16 + #define vreinterpretq_f64_p16(a) simde_vreinterpretq_f64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_f64_p64(a); + #else + simde_float64x2_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p64 + #define vreinterpretq_f64_p64(a) simde_vreinterpretq_f64_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_p64(a); + #else + simde_float16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_p64 + #define vreinterpret_f16_p64 simde_vreinterpret_f16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_p8(a); + #else + simde_float16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpret_f16_p8 + #define vreinterpret_f16_p8 simde_vreinterpret_f16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_p64(a); + #else + simde_float16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_p64 + #define vreinterpretq_f16_p64(a) simde_vreinterpretq_f16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_p8(a); + #else + simde_float16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + 
simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vreinterpretq_f16_p8 + #define vreinterpretq_f16_p8(a) simde_vreinterpretq_f16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u8_p16(a); + #else + simde_uint8x8_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_p16 + #define vreinterpret_u8_p16 simde_vreinterpret_u8_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u8_p64(a); + #else + simde_uint8x8_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_p64 + #define vreinterpret_u8_p64 simde_vreinterpret_u8_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u8_p16(a); + #else + simde_uint8x16_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p16 + #define vreinterpretq_u8_p16(a) simde_vreinterpretq_u8_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u8_p64(a); + #else + simde_uint8x16_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p64 + #define vreinterpretq_u8_p64(a) simde_vreinterpretq_u8_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u16_p8(a); + #else + simde_uint16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_p8 + #define vreinterpret_u16_p8 simde_vreinterpret_u16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u16_p64(a); + #else + simde_uint16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_p64 + #define vreinterpret_u16_p64 simde_vreinterpret_u16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) 
+ return vreinterpretq_u16_p8(a); + #else + simde_uint16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p8 + #define vreinterpretq_u16_p8(a) simde_vreinterpretq_u16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u16_p64(a); + #else + simde_uint16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p64 + #define vreinterpretq_u16_p64(a) simde_vreinterpretq_u16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u32_p8(a); + #else + simde_uint32x2_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_p8 + #define vreinterpret_u32_p8 simde_vreinterpret_u32_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u32_p8(a); + #else + simde_uint32x4_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p8 + #define vreinterpretq_u32_p8(a) simde_vreinterpretq_u32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u32_p16(a); + #else + simde_uint32x2_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_p16 + #define vreinterpret_u32_p16 simde_vreinterpret_u32_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u32_p16(a); + #else + simde_uint32x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p16 + #define vreinterpretq_u32_p16(a) simde_vreinterpretq_u32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u32_p64(a); + #else + simde_uint32x2_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_p64 + #define vreinterpret_u32_p64 simde_vreinterpret_u32_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t 
+simde_vreinterpretq_u32_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u32_p64(a); + #else + simde_uint32x4_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p64 + #define vreinterpretq_u32_p64(a) simde_vreinterpretq_u32_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u64_p8(a); + #else + simde_uint64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_p8 + #define vreinterpret_u64_p8 simde_vreinterpret_u64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u64_p8(a); + #else + simde_uint64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p8 + #define vreinterpretq_u64_p8(a) simde_vreinterpretq_u64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u64_p16(a); + #else + simde_uint64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_p16 + #define vreinterpret_u64_p16 simde_vreinterpret_u64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u64_p16(a); + #else + simde_uint64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p16 + #define vreinterpretq_u64_p16(a) simde_vreinterpretq_u64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u16(a); + #else + simde_poly8x8_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u16 + #define vreinterpret_p8_u16 simde_vreinterpret_p8_u16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u64(a); + #else + simde_poly8x8_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u64 + #define vreinterpret_p8_u64 
simde_vreinterpret_p8_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u16(a); + #else + simde_poly8x16_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u16 + #define vreinterpretq_p8_u16(a) simde_vreinterpretq_p8_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u64(a); + #else + simde_poly8x16_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u64 + #define vreinterpretq_p8_u64(a) simde_vreinterpretq_p8_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u32(a); + #else + simde_poly8x8_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u32 + #define vreinterpret_p8_u32 simde_vreinterpret_p8_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u32(a); + #else + simde_poly8x16_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u32 + #define vreinterpretq_p8_u32(a) simde_vreinterpretq_p8_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u8(a); + #else + simde_poly16x4_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u8 + #define vreinterpret_p16_u8 simde_vreinterpret_p16_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u32(a); + #else + simde_poly16x4_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u32 + #define vreinterpret_p16_u32 simde_vreinterpret_p16_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u64(a); + #else + simde_poly16x4_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + 
#undef vreinterpret_p16_u64 + #define vreinterpret_p16_u64 simde_vreinterpret_p16_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u8(a); + #else + simde_poly16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u8 + #define vreinterpretq_p16_u8(a) simde_vreinterpretq_p16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u32(a); + #else + simde_poly16x8_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u32 + #define vreinterpretq_p16_u32(a) simde_vreinterpretq_p16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u64(a); + #else + simde_poly16x8_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u64 + #define vreinterpretq_p16_u64(a) simde_vreinterpretq_p16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u8(a); + #else + simde_poly64x1_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u8 + #define vreinterpret_p64_u8 simde_vreinterpret_p64_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u16(a); + #else + simde_poly64x1_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u16 + #define vreinterpret_p64_u16 simde_vreinterpret_p64_u16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u32(a); + #else + simde_poly64x1_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u32 + #define vreinterpret_p64_u32 simde_vreinterpret_p64_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u8(a); + #else + simde_poly64x2_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return 
simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u8 + #define vreinterpretq_p64_u8(a) simde_vreinterpretq_p64_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u16(a); + #else + simde_poly64x2_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u16 + #define vreinterpretq_p64_u16(a) simde_vreinterpretq_p64_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u32(a); + #else + simde_poly64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u32 + #define vreinterpretq_p64_u32(a) simde_vreinterpretq_p64_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u8_p8(a); + #else + simde_uint8x8_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_p8 + #define vreinterpret_u8_p8 simde_vreinterpret_u8_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u8_p8(a); + #else + simde_uint8x16_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p8 + #define vreinterpretq_u8_p8(a) simde_vreinterpretq_u8_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u16_p16(a); + #else + simde_uint16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_p16 + #define vreinterpret_u16_p16 simde_vreinterpret_u16_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u16_p16(a); + #else + simde_uint16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p16 + #define vreinterpretq_u16_p16(a) simde_vreinterpretq_u16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u64_p64(a); + #else + simde_uint64x1_private r_; + simde_poly64x1_private 
a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_p64 + #define vreinterpret_u64_p64 simde_vreinterpret_u64_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u64_p64(a); + #else + simde_uint64x2_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p64 + #define vreinterpretq_u64_p64(a) simde_vreinterpretq_u64_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u8(a); + #else + simde_poly8x8_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u8 + #define vreinterpret_p8_u8 simde_vreinterpret_p8_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u8(a); + #else + simde_poly8x16_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u8 + #define vreinterpretq_p8_u8(a) simde_vreinterpretq_p8_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u16(a); + #else + simde_poly16x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u16 + #define vreinterpret_p16_u16 simde_vreinterpret_p16_u16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u16(a); + #else + simde_poly16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u16 + #define vreinterpretq_p16_u16(a) simde_vreinterpretq_p16_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u64(a); + #else + simde_poly64x1_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u64 + #define vreinterpret_p64_u64 simde_vreinterpret_p64_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return 
vreinterpretq_p64_u64(a); + #else + simde_poly64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u64 + #define vreinterpretq_p64_u64(a) simde_vreinterpretq_p64_u64(a) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s8(a); + #else + simde_poly128_t r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s8 + #define vreinterpretq_p128_s8(a) simde_vreinterpretq_p128_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s16(a); + #else + simde_poly128_t r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s16 + #define vreinterpretq_p128_s16(a) simde_vreinterpretq_p128_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s32(a); + #else + simde_poly128_t r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s32 + #define vreinterpretq_p128_s32(a) simde_vreinterpretq_p128_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s64(a); + #else + simde_poly128_t r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s64 + #define vreinterpretq_p128_s64(a) simde_vreinterpretq_p128_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u8(a); + #else + simde_poly128_t r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u8 + #define vreinterpretq_p128_u8(a) simde_vreinterpretq_p128_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u16(a); + #else + simde_poly128_t r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u16 + #define vreinterpretq_p128_u16(a) 
simde_vreinterpretq_p128_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u32(a); + #else + simde_poly128_t r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u32 + #define vreinterpretq_p128_u32(a) simde_vreinterpretq_p128_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u64(a); + #else + simde_poly128_t r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u64 + #define vreinterpretq_p128_u64(a) simde_vreinterpretq_p128_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_p8(a); + #else + simde_poly128_t r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_p8 + #define vreinterpretq_p128_p8(a) simde_vreinterpretq_p128_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_p16(a); + #else + simde_poly128_t r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_p16 + #define vreinterpretq_p128_p16(a) simde_vreinterpretq_p128_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_f16(a); + #else + simde_poly128_t r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_p128_f16 + #define vreinterpretq_p128_f16(a) simde_vreinterpretq_p128_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_f32(a); + #else + simde_poly128_t r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_p128_f32 + #define vreinterpretq_p128_f32(a) simde_vreinterpretq_p128_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_f64(simde_float64x2_t a) { + #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_f64(a); + #else + simde_poly128_t r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_p128_f64 + #define vreinterpretq_p128_f64(a) simde_vreinterpretq_p128_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s8_p128(a); + #else + simde_int8x16_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_s8_p128 + #define vreinterpretq_s8_p128(a) simde_vreinterpretq_s8_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s16_p128(a); + #else + simde_int16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_s16_p128 + #define vreinterpretq_s16_p128(a) simde_vreinterpretq_s16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s32_p128(a); + #else + simde_int32x4_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_s32_p128 + #define vreinterpretq_s32_p128(a) simde_vreinterpretq_s32_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s64_p128(a); + #else + simde_int64x2_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_s64_p128 + #define vreinterpretq_s64_p128(a) simde_vreinterpretq_s64_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u8_p128(a); + #else + simde_uint8x16_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_u8_p128 + #define vreinterpretq_u8_p128(a) simde_vreinterpretq_u8_p128(a) 
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u16_p128(a); + #else + simde_uint16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_u16_p128 + #define vreinterpretq_u16_p128(a) simde_vreinterpretq_u16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u32_p128(a); + #else + simde_uint32x4_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_u32_p128 + #define vreinterpretq_u32_p128(a) simde_vreinterpretq_u32_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u64_p128(a); + #else + simde_uint64x2_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_u64_p128 + #define vreinterpretq_u64_p128(a) simde_vreinterpretq_u64_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p8_p128(a); + #else + simde_poly8x16_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_p8_p128 + #define vreinterpretq_p8_p128(a) simde_vreinterpretq_p8_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p16_p128(a); + #else + simde_poly16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_p16_p128 + #define vreinterpretq_p16_p128(a) simde_vreinterpretq_p16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_f16_p128(a); + #else + simde_float16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_f16_p128 + #define vreinterpretq_f16_p128(a) simde_vreinterpretq_f16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_f64_p128(a); + #else + simde_float64x2_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRYPTO))) + #undef vreinterpretq_f64_p128 + #define vreinterpretq_f64_p128(a) simde_vreinterpretq_f64_p128(a) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s8(a); + #else + simde_bfloat16x4_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_s8 + #define vreinterpret_bf16_s8(a) simde_vreinterpret_bf16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s16(a); + #else + simde_bfloat16x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_s16 + #define vreinterpret_bf16_s16(a) simde_vreinterpret_bf16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s32(a); + #else + simde_bfloat16x4_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_s32 + #define vreinterpret_bf16_s32(a) simde_vreinterpret_bf16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s64(a); + #else + simde_bfloat16x4_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_s64 + #define vreinterpret_bf16_s64(a) simde_vreinterpret_bf16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t 
+simde_vreinterpret_bf16_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u8(a); + #else + simde_bfloat16x4_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_u8 + #define vreinterpret_bf16_u8(a) simde_vreinterpret_bf16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u16(a); + #else + simde_bfloat16x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_u16 + #define vreinterpret_bf16_u16(a) simde_vreinterpret_bf16_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u32(a); + #else + simde_bfloat16x4_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_u32 + #define vreinterpret_bf16_u32(a) simde_vreinterpret_bf16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u64(a); + #else + simde_bfloat16x4_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_u64 + #define vreinterpret_bf16_u64(a) simde_vreinterpret_bf16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_f32(a); + #else + simde_bfloat16x4_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_f32 + #define vreinterpret_bf16_f32 simde_vreinterpret_bf16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_f64(a); + #else + simde_bfloat16x4_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return 
simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_f64 + #define vreinterpret_bf16_f64 simde_vreinterpret_bf16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s8(a); + #else + simde_bfloat16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_s8 + #define vreinterpretq_bf16_s8(a) simde_vreinterpretq_bf16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s16(a); + #else + simde_bfloat16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_s16 + #define vreinterpretq_bf16_s16(a) simde_vreinterpretq_bf16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s32(a); + #else + simde_bfloat16x8_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_s32 + #define vreinterpretq_bf16_s32(a) simde_vreinterpretq_bf16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s64(a); + #else + simde_bfloat16x8_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_s64 + #define vreinterpretq_bf16_s64(a) simde_vreinterpretq_bf16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u8(a); + #else + simde_bfloat16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_u8 + #define vreinterpretq_bf16_u8(a) simde_vreinterpretq_bf16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_bfloat16x8_t +simde_vreinterpretq_bf16_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u16(a); + #else + simde_bfloat16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_u16 + #define vreinterpretq_bf16_u16(a) simde_vreinterpretq_bf16_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u32(a); + #else + simde_bfloat16x8_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_u32 + #define vreinterpretq_bf16_u32(a) simde_vreinterpretq_bf16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u64(a); + #else + simde_bfloat16x8_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_u64 + #define vreinterpretq_bf16_u64(a) simde_vreinterpretq_bf16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_f32(a); + #else + simde_bfloat16x8_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_f32 + #define vreinterpretq_bf16_f32 simde_vreinterpretq_bf16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_f64(a); + #else + simde_bfloat16x8_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_f64 + #define vreinterpretq_bf16_f64 simde_vreinterpretq_bf16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s8_bf16(a); + #else + simde_int8x8_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, 
sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_s8_bf16 + #define vreinterpret_s8_bf16(a) simde_vreinterpret_s8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s16_bf16(a); + #else + simde_int16x4_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_s16_bf16 + #define vreinterpret_s16_bf16(a) simde_vreinterpret_s16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s32_bf16(a); + #else + simde_int32x2_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_s32_bf16 + #define vreinterpret_s32_bf16(a) simde_vreinterpret_s32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s64_bf16(a); + #else + simde_int64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_s64_bf16 + #define vreinterpret_s64_bf16(a) simde_vreinterpret_s64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u8_bf16(a); + #else + simde_uint8x8_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_u8_bf16 + #define vreinterpret_u8_bf16(a) simde_vreinterpret_u8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u16_bf16(a); + #else + simde_uint16x4_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_u16_bf16 + #define vreinterpret_u16_bf16(a) simde_vreinterpret_u16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_uint32x2_t +simde_vreinterpret_u32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u32_bf16(a); + #else + simde_uint32x2_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_u32_bf16 + #define vreinterpret_u32_bf16(a) simde_vreinterpret_u32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u64_bf16(a); + #else + simde_uint64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_u64_bf16 + #define vreinterpret_u64_bf16(a) simde_vreinterpret_u64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_f32_bf16(a); + #else + simde_float32x2_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_f32_bf16 + #define vreinterpret_f32_bf16 simde_vreinterpret_f32_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_f64_bf16(a); + #else + simde_float64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_f64_bf16 + #define vreinterpret_f64_bf16 simde_vreinterpret_f64_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s8_bf16(a); + #else + simde_int8x16_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_s8_bf16 + #define vreinterpretq_s8_bf16(a) simde_vreinterpretq_s8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s16_bf16(a); + #else + simde_int16x8_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return 
simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_s16_bf16 + #define vreinterpretq_s16_bf16(a) simde_vreinterpretq_s16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s32_bf16(a); + #else + simde_int32x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_s32_bf16 + #define vreinterpretq_s32_bf16(a) simde_vreinterpretq_s32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s64_bf16(a); + #else + simde_int64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_s64_bf16 + #define vreinterpretq_s64_bf16(a) simde_vreinterpretq_s64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u8_bf16(a); + #else + simde_uint8x16_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_u8_bf16 + #define vreinterpretq_u8_bf16(a) simde_vreinterpretq_u8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u16_bf16(a); + #else + simde_uint16x8_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_u16_bf16 + #define vreinterpretq_u16_bf16(a) simde_vreinterpretq_u16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u32_bf16(a); + #else + simde_uint32x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_u32_bf16 + #define vreinterpretq_u32_bf16(a) simde_vreinterpretq_u32_bf16(a) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u64_bf16(a); + #else + simde_uint64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_u64_bf16 + #define vreinterpretq_u64_bf16(a) simde_vreinterpretq_u64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_f32_bf16(a); + #else + simde_float32x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_f32_bf16 + #define vreinterpretq_f32_bf16 simde_vreinterpretq_f32_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_f64_bf16(a); + #else + simde_float64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_f64_bf16 + #define vreinterpretq_f64_bf16 simde_vreinterpretq_f64_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_p8(a); + #else + simde_bfloat16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_p8 + #define vreinterpret_bf16_p8(a) simde_vreinterpret_bf16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_p16(a); + #else + simde_bfloat16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_p16 + #define vreinterpret_bf16_p16(a) simde_vreinterpret_bf16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_p64(a); + #else + simde_bfloat16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + 
simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_bf16_p64 + #define vreinterpret_bf16_p64(a) simde_vreinterpret_bf16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p8(a); + #else + simde_bfloat16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_p8 + #define vreinterpretq_bf16_p8(a) simde_vreinterpretq_bf16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p16(a); + #else + simde_bfloat16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_p16 + #define vreinterpretq_bf16_p16(a) simde_vreinterpretq_bf16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p64(a); + #else + simde_bfloat16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_p64 + #define vreinterpretq_bf16_p64(a) simde_vreinterpretq_bf16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_p8_bf16(a); + #else + simde_poly8x8_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_p8_bf16 + #define vreinterpret_p8_bf16(a) simde_vreinterpret_p8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_p16_bf16(a); + #else + simde_poly16x4_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_p16_bf16 + #define vreinterpret_p16_bf16(a) 
simde_vreinterpret_p16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_p64_bf16(a); + #else + simde_poly64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpret_p64_bf16 + #define vreinterpret_p64_bf16(a) simde_vreinterpret_p64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p8_bf16(a); + #else + simde_poly8x16_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_p8_bf16 + #define vreinterpretq_p8_bf16(a) simde_vreinterpretq_p8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p16_bf16(a); + #else + simde_poly16x8_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_p16_bf16 + #define vreinterpretq_p16_bf16(a) simde_vreinterpretq_p16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p64_bf16(a); + #else + simde_poly64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_p64_bf16 + #define vreinterpretq_p64_bf16(a) simde_vreinterpretq_p64_bf16(a) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p128_bf16(a); + #else + simde_poly128_t r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_p128_bf16 + #define vreinterpretq_p128_bf16(a) simde_vreinterpretq_p128_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p128(a); + #else + simde_bfloat16x8_private r_; + 
simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) + #undef vreinterpretq_bf16_p128 + #define vreinterpretq_bf16_p128(a) simde_vreinterpretq_bf16_p128(a) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/rev16.h b/arm/neon/rev16.h index 55fe38c2e..3cbd3df71 100644 --- a/arm/neon/rev16.h +++ b/arm/neon/rev16.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_REV16_H) @@ -47,6 +49,9 @@ simde_vrev16_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_set_pi8(6, 7, 4, 5, 2, 3, 0, 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {1, 0, 3, 2, 5, 4, 7, 6}; + r_.sv64 = __riscv_vrgather_vv_i8m1(a_.sv64, __riscv_vle8_v_u8m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.values, a_.values, 1, 0, 3, 2, 5, 4, 7, 6); #else @@ -98,6 +103,9 @@ simde_vrev16q_s8(simde_int8x16_t a) { r_.m128i = _mm_shuffle_epi8(a_.m128i, _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + r_.sv128 = __riscv_vrgather_vv_i8m1(a_.sv128, __riscv_vle8_v_u8m1(shuffle_idx, 16), 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else @@ -129,6 +137,34 @@ simde_vrev16q_u8(simde_uint8x16_t a) { #define vrev16q_u8(a) simde_vrev16q_u8(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrev16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev16_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vrev16_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev16_p8 + #define vrev16_p8(a) simde_vrev16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrev16q_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev16q_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vrev16q_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev16q_p8 + #define vrev16q_p8(a) simde_vrev16q_p8(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/rev32.h b/arm/neon/rev32.h index 3fac26505..e3dff42cc 100644 --- a/arm/neon/rev32.h +++ b/arm/neon/rev32.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_REV32_H) @@ -47,6 +49,9 @@ simde_vrev32_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_set_pi8(4, 5, 6, 7, 0, 1, 2, 3)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t 
shuffle_idx[] = {3, 2, 1, 0, 7, 6, 5, 4}; + r_.sv64 = __riscv_vrgather_vv_i8m1(a_.sv64, __riscv_vle8_v_u8m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.values, a_.values, 3, 2, 1, 0, 7, 6, 5, 4); #else @@ -76,6 +81,9 @@ simde_vrev32_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi16(a_.m64, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {1, 0, 3, 2}; + r_.sv64 = __riscv_vrgather_vv_i16m1(a_.sv64, __riscv_vle16_v_u16m1(shuffle_idx, 4), 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 1, 0, 3, 2); #else @@ -142,6 +150,9 @@ simde_vrev32q_s8(simde_int8x16_t a) { 4, 5, 6, 7, 0, 1, 2, 3)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}; + r_.sv128 = __riscv_vrgather_vv_i8m1(a_.sv128, __riscv_vle8_v_u8m1(shuffle_idx, 16), 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); #else @@ -172,7 +183,10 @@ simde_vrev32q_s16(simde_int16x8_t a) { r_, a_ = simde_int16x8_to_private(a); - #if defined(SIMDE_X86_SSSE3_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {1, 0, 3, 2, 5, 4, 7, 6}; + r_.sv128 = __riscv_vrgather_vv_i16m1(a_.sv128, __riscv_vle16_v_u16m1(shuffle_idx, 8), 8); + #elif defined(SIMDE_X86_SSSE3_NATIVE) r_.m128i = _mm_shuffle_epi8(a_.m128i, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); #elif defined(SIMDE_X86_SSE2_NATIVE) @@ -226,6 +240,62 @@ simde_vrev32q_u16(simde_uint16x8_t a) { #define vrev32q_u16(a) simde_vrev32q_u16(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrev32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vrev32_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32_p8 + #define vrev32_p8(a) simde_vrev32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vrev32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32_p16(a); + #else + return simde_vreinterpret_p16_s16(simde_vrev32_s16(simde_vreinterpret_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32_p16 + #define vrev32_p16(a) simde_vrev32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrev32q_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32q_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vrev32q_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32q_p8 + #define vrev32q_p8(a) simde_vrev32q_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vrev32q_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32q_p16(a); + #else + return simde_vreinterpretq_p16_s16(simde_vrev32q_s16(simde_vreinterpretq_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32q_p16 + #define vrev32q_p16(a) simde_vrev32q_p16(a) +#endif + 
SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/rev64.h b/arm/neon/rev64.h index 274f08126..e6c85bb52 100644 --- a/arm/neon/rev64.h +++ b/arm/neon/rev64.h @@ -23,11 +23,10 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ -/* N.B. CM: vrev64_f16 and vrev64q_f16 are omitted as - * SIMDe has no 16-bit floating point support. */ - #if !defined(SIMDE_ARM_NEON_REV64_H) #define SIMDE_ARM_NEON_REV64_H @@ -50,6 +49,9 @@ simde_vrev64_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_set_pi8(0, 1, 2, 3, 4, 5, 6, 7)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {7, 6, 5, 4, 3, 2, 1, 0}; + r_.sv64 = __riscv_vrgather_vv_i8m1(a_.sv64, __riscv_vle8_v_u8m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.values, a_.values, 7, 6, 5, 4, 3, 2, 1, 0); #else @@ -79,6 +81,9 @@ simde_vrev64_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi16(a_.m64, (0 << 6) | (1 << 4) | (2 << 2) | (3 << 0)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {3, 2, 1, 0}; + r_.sv64 = __riscv_vrgather_vv_i16m1(a_.sv64, __riscv_vle16_v_u16m1(shuffle_idx, 4), 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 3, 2, 1, 0); #else @@ -108,6 +113,9 @@ simde_vrev64_s32(simde_int32x2_t a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi16(a_.m64, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint32_t shuffle_idx[] = {1, 0}; + r_.sv64 = __riscv_vrgather_vv_i32m1(a_.sv64, __riscv_vle32_v_u32m1(shuffle_idx, 2), 2); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 0); #else @@ -167,6 +175,20 @@ simde_vrev64_u32(simde_uint32x2_t a) { #define vrev64_u32(a) simde_vrev64_u32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrev64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrev64_f16(a); + #else + return simde_vreinterpret_f16_s16(simde_vrev64_s16(simde_vreinterpret_s16_f16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrev64_f16 + #define vrev64_f16(a) simde_vrev64_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrev64_f32(simde_float32x2_t a) { @@ -202,6 +224,9 @@ simde_vrev64q_s8(simde_int8x16_t a) { 0, 1, 2, 3, 4, 5, 6, 7)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + r_.sv128 = __riscv_vrgather_vv_i8m1(a_.sv128, __riscv_vle8_v_u8m1(shuffle_idx, 16), 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); #else @@ -241,6 +266,9 @@ simde_vrev64q_s16(simde_int16x8_t a) { (0 << 6) | (1 << 4) | (2 << 2) | (3 << 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, 
a_.v128, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {3, 2, 1, 0, 7, 6, 5, 4}; + r_.sv128 = __riscv_vrgather_vv_i16m1(a_.sv128, __riscv_vle16_v_u16m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 3, 2, 1, 0, 7, 6, 5, 4); #else @@ -275,6 +303,9 @@ simde_vrev64q_s32(simde_int32x4_t a) { r_.m128i = _mm_shuffle_epi32(a_.m128i, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint32_t shuffle_idx[] = {1, 0, 3, 2}; + r_.sv128 = __riscv_vrgather_vv_i32m1(a_.sv128, __riscv_vle32_v_u32m1(shuffle_idx, 4), 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 0, 3, 2); #else @@ -334,6 +365,20 @@ simde_vrev64q_u32(simde_uint32x4_t a) { #define vrev64q_u32(a) simde_vrev64q_u32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrev64q_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrev64q_f16(a); + #else + return simde_vreinterpretq_f16_s16(simde_vrev64q_s16(simde_vreinterpretq_s16_f16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrev64q_f16 + #define vrev64q_f16(a) simde_vrev64q_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrev64q_f32(simde_float32x4_t a) { @@ -348,6 +393,62 @@ simde_vrev64q_f32(simde_float32x4_t a) { #define vrev64q_f32(a) simde_vrev64q_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrev64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vrev64_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64_p8 + #define vrev64_p8(a) simde_vrev64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vrev64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64_p16(a); + #else + return simde_vreinterpret_p16_s16(simde_vrev64_s16(simde_vreinterpret_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64_p16 + #define vrev64_p16(a) simde_vrev64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrev64q_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64q_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vrev64q_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64q_p8 + #define vrev64q_p8(a) simde_vrev64q_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vrev64q_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64q_p16(a); + #else + return simde_vreinterpretq_p16_s16(simde_vrev64q_s16(simde_vreinterpretq_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64q_p16 + #define vrev64q_p16(a) simde_vrev64q_p16(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/rnd.h b/arm/neon/rnd.h index 9a007b77c..64ea7ec19 100644 --- a/arm/neon/rnd.h +++ b/arm/neon/rnd.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen 
Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RND_H) @@ -33,6 +34,43 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndh_f16(a); + #else + return simde_float16_from_float32(simde_math_truncf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrndh_f16 + #define vrndh_f16(a) simde_vrndh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrnd_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrnd_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrnd_f16 + #define vrnd_f16(a) simde_vrnd_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrnd_f32(simde_float32x2_t a) { @@ -79,6 +117,29 @@ simde_vrnd_f64(simde_float64x1_t a) { #define vrnd_f64(a) simde_vrnd_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrndq_f16 + #define vrndq_f16(a) simde_vrndq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndq_f32(simde_float32x4_t a) { @@ -125,7 +186,7 @@ simde_vrndq_f64(simde_float64x2_t a) { #if defined(SIMDE_X86_SSE4_1_NATIVE) r_.m128d = _mm_round_pd(a_.m128d, _MM_FROUND_TO_ZERO); #elif defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - r_.m128d = _mm_trunc_ps(a_.m128d); + r_.m128d = _mm_trunc_pd(a_.m128d); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/rnd32x.h b/arm/neon/rnd32x.h new file mode 100644 index 000000000..560f1ce0a --- /dev/null +++ b/arm/neon/rnd32x.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
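The RVV implementations in the rev32/rev64 hunks above all follow the same shape: load a constant index vector, then permute with __riscv_vrgather_vv_*, whose per-element behaviour for in-range indices is simply r[i] = a[idx[i]]. A plain-C model of that primitive (illustrative only, not the SIMDe or RVV API):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of vrgather.vv for in-range indices: each result element
 * picks an arbitrary source element, so a constant index vector such as
 * {7,6,5,4,3,2,1,0} performs the rev64 byte reversal used above. */
static void ref_vrgather_u8(uint8_t r[], const uint8_t a[],
                            const uint8_t idx[], size_t vl) {
  for (size_t i = 0; i < vl; i++) {
    r[i] = a[idx[i]];
  }
}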
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND32X_H) +#define SIMDE_ARM_NEON_RND32X_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd32x_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd32x_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32x_f32 + #define vrnd32x_f32(a) simde_vrnd32x_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd32x_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd32x_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32x_f64 + #define vrnd32x_f64(a) simde_vrnd32x_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd32xq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd32xq_f32(a); + 
#else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32xq_f32 + #define vrnd32xq_f32(a) simde_vrnd32xq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd32xq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd32xq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0))&& !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32xq_f64 + #define vrnd32xq_f64(a) simde_vrnd32xq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND32X_H) */ diff --git a/arm/neon/rnd32z.h b/arm/neon/rnd32z.h new file mode 100644 index 000000000..2b8fe28a5 --- /dev/null +++ b/arm/neon/rnd32z.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND32Z_H) +#define SIMDE_ARM_NEON_RND32Z_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd32z_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd32z_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32z_f32 + #define vrnd32z_f32(a) simde_vrnd32z_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd32z_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd32z_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32z_f64 + #define vrnd32z_f64(a) simde_vrnd32z_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd32zq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd32zq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || 
simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32zq_f32 + #define vrnd32zq_f32(a) simde_vrnd32zq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd32zq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd32zq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0))&& !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd32zq_f64 + #define vrnd32zq_f64(a) simde_vrnd32zq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND32Z_H) */ diff --git a/arm/neon/rnd64x.h b/arm/neon/rnd64x.h new file mode 100644 index 000000000..76f5df6b8 --- /dev/null +++ b/arm/neon/rnd64x.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
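The portable fallbacks in these new rnd{32,64}{x,z} headers all share one shape: round the lane (rint for the x variants, trunc for the z variants), then map NaN, infinities, and anything outside the destination integer range to the most negative representable value. A single-lane sketch of the 32-bit round-to-current-mode case (the function name is illustrative only):

#include <math.h>
#include <stdint.h>

/* One float lane of the vrnd32x fallback: round in the current rounding
 * mode (nearest-even by default), then collapse NaN/Inf and anything
 * outside the int32 range to (float)INT32_MIN. */
static float ref_rnd32x_lane(float x) {
  if (isnan(x) || isinf(x)) {
    return (float) INT32_MIN;
  }
  float r = rintf(x);
  if (r > (float) INT32_MAX || r < (float) INT32_MIN) {
    return (float) INT32_MIN;
  }
  return r;
}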
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND64X_H) +#define SIMDE_ARM_NEON_RND64X_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd64x_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd64x_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64x_f32 + #define vrnd64x_f32(a) simde_vrnd64x_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd64x_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd64x_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64x_f64 + #define vrnd64x_f64(a) simde_vrnd64x_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd64xq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd64xq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = 
HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64xq_f32 + #define vrnd64xq_f32(a) simde_vrnd64xq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd64xq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd64xq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64xq_f64 + #define vrnd64xq_f64(a) simde_vrnd64xq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND64X_H) */ diff --git a/arm/neon/rnd64z.h b/arm/neon/rnd64z.h new file mode 100644 index 000000000..cff68b3e8 --- /dev/null +++ b/arm/neon/rnd64z.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND64Z_H) +#define SIMDE_ARM_NEON_RND64Z_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd64z_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd64z_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64z_f32 + #define vrnd64z_f32(a) simde_vrnd64z_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd64z_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd64z_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64z_f64 + #define vrnd64z_f64(a) simde_vrnd64z_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd64zq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) + return vrnd64zq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = 
HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64zq_f32 + #define vrnd64zq_f32(a) simde_vrnd64zq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd64zq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) + return vrnd64zq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) + #undef vrnd64zq_f64 + #define vrnd64zq_f64(a) simde_vrnd64zq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND64Z_H) */ diff --git a/arm/neon/rnda.h b/arm/neon/rnda.h new file mode 100644 index 000000000..169002b5c --- /dev/null +++ b/arm/neon/rnda.h @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_RNDA_H) +#define SIMDE_ARM_NEON_RNDA_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndah_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndah_f16(a); + #else + return simde_float16_from_float32(simde_math_roundf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrndah_f16 + #define vrndah_f16(a) simde_vrndah_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrnda_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrnda_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1_rm(a_.sv64, 0, 4), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndah_f16(a_.values[i]); + } + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrnda_f16 + #define vrnda_f16(a) simde_vrnda_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnda_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrnda_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv64 = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv64, 0, 2), 2); + #else + simde_float32 nan = SIMDE_MATH_NAN; + vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2); + r_.sv64 = __riscv_vfmerge_vfm_f32m1(__riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv64, 0, 2), 2), \ + nan, mask, 2); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_roundf(a_.values[i]); + } + #endif + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrnda_f32 + #define vrnda_f32(a) simde_vrnda_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnda_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrnda_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv64 = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv64, 0, 1), 1); + #else + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1); + r_.sv64 = __riscv_vfmerge_vfm_f64m1(__riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv64, 0, 1), 1), \ + nan, mask, 1); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_round(a_.values[i]); + } + #endif + + return simde_float64x1_from_private(r_); + 
#endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnda_f64 + #define vrnda_f64(a) simde_vrnda_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndaq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndaq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1_rm(a_.sv128, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndah_f16(a_.values[i]); + } + #endif + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) + #undef vrndaq_f16 + #define vrndaq_f16(a) simde_vrndaq_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrndaq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrndaq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv128 = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv128, 0, 4), 4); + #else + simde_float32 nan = SIMDE_MATH_NAN; + vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4); + r_.sv128 = __riscv_vfmerge_vfm_f32m1(__riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv128, 0, 4), 4), \ + nan, mask, 4); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_roundf(a_.values[i]); + } + #endif + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndaq_f32 + #define vrndaq_f32(a) simde_vrndaq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrndaq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrndaq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv128 = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv128, 0, 2), 2); + #else + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2); + r_.sv128 = __riscv_vfmerge_vfm_f64m1(__riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv128, 0, 2), 2), \ + nan, mask, 2); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_round(a_.values[i]); + } + #endif + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndaq_f64 + #define vrndaq_f64(a) simde_vrndaq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RNDA_H) */ diff --git a/arm/neon/rndi.h b/arm/neon/rndi.h index b15949b55..48c7323a6 100644 --- a/arm/neon/rndi.h +++ b/arm/neon/rndi.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDI_H) @@ -33,6 +34,43 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES 
+simde_float16_t +simde_vrndih_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vrndih_f16(a); + #else + return simde_float16_from_float32(simde_math_nearbyintf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vrndih_f16 + #define vrndih_f16(a) simde_vrndih_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndi_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vrndi_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndih_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vrndi_f16 + #define vrndi_f16(a) simde_vrndi_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndi_f32(simde_float32x2_t a) { @@ -51,7 +89,7 @@ simde_vrndi_f32(simde_float32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndi_f32 #define vrndi_f32(a) simde_vrndi_f32(a) #endif @@ -74,11 +112,34 @@ simde_vrndi_f64(simde_float64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndi_f64 #define vrndi_f64(a) simde_vrndi_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndiq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vrndiq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndih_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) + #undef vrndiq_f16 + #define vrndiq_f16(a) simde_vrndiq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndiq_f32(simde_float32x4_t a) { @@ -101,7 +162,7 @@ simde_vrndiq_f32(simde_float32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndiq_f32 #define vrndiq_f32(a) simde_vrndiq_f32(a) #endif @@ -128,7 +189,7 @@ simde_vrndiq_f64(simde_float64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) 
&& !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndiq_f64 #define vrndiq_f64(a) simde_vrndiq_f64(a) #endif diff --git a/arm/neon/rndm.h b/arm/neon/rndm.h index 386c0ecab..5f8d0498a 100644 --- a/arm/neon/rndm.h +++ b/arm/neon/rndm.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDM_H) @@ -33,6 +34,43 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndmh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndmh_f16(a); + #else + return simde_float16_from_float32(simde_math_floorf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndmh_f16 + #define vrndmh_f16(a) simde_vrndmh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndm_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndm_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndmh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndm_f16 + #define vrndm_f16(a) simde_vrndm_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndm_f32(simde_float32x2_t a) { @@ -79,6 +117,29 @@ simde_vrndm_f64(simde_float64x1_t a) { #define vrndm_f64(a) simde_vrndm_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndmq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndmq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndmh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndmq_f16 + #define vrndmq_f16(a) simde_vrndmq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndmq_f32(simde_float32x4_t a) { diff --git a/arm/neon/rndn.h b/arm/neon/rndn.h index d3d073172..b2289f497 100644 --- a/arm/neon/rndn.h +++ b/arm/neon/rndn.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDN_H) @@ -33,6 +34,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndnh_f16(simde_float16_t a) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && \ + (!defined(HEDLEY_GCC_VERSION) || (defined(SIMDE_ARM_NEON_A64V8_NATIVE) && HEDLEY_GCC_VERSION_CHECK(8,0,0))) && defined(SIMDE_ARM_NEON_FP16) + return vrndnh_f16(a); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + return simde_float16_from_float32(simde_math_roundevenf(a_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && \ + (!defined(HEDLEY_GCC_VERSION) || (defined(SIMDE_ARM_NEON_A64V8_NATIVE) && HEDLEY_GCC_VERSION_CHECK(8,0,0))) && defined(SIMDE_ARM_NEON_FP16))) + #undef vrndnh_f16 + #define vrndnh_f16(a) simde_vrndnh_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrndns_f32(simde_float32_t a) { @@ -45,11 +66,37 @@ simde_vrndns_f32(simde_float32_t a) { return simde_math_roundevenf(a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && \ + (!defined(HEDLEY_GCC_VERSION) || (defined(SIMDE_ARM_NEON_A64V8_NATIVE) && HEDLEY_GCC_VERSION_CHECK(8,0,0))))) #undef vrndns_f32 #define vrndns_f32(a) simde_vrndns_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndn_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndn_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndnh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndn_f16 + #define vrndn_f16(a) simde_vrndn_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndn_f32(simde_float32x2_t a) { @@ -97,6 +144,30 @@ simde_vrndn_f64(simde_float64x1_t a) { #define vrndn_f64(a) simde_vrndn_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndnq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndnq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndnh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndnq_f16 + #define vrndnq_f16(a) simde_vrndnq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndnq_f32(simde_float32x4_t a) { @@ -127,8 +198,7 @@ simde_vrndnq_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vrndnq_f64(simde_float64x2_t a) { - #if \ - defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vrndnq_f64(a); #else simde_float64x2_private diff --git a/arm/neon/rndp.h b/arm/neon/rndp.h index ee602a3f7..ac4f88c14 100644 --- a/arm/neon/rndp.h +++ b/arm/neon/rndp.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDP_H) @@ -33,6 +34,45 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndph_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndph_f16(a); + #else + return simde_float16_from_float32(simde_math_ceilf(simde_float16_to_float32(a))); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndph_f16 + #define vrndph_f16(a) simde_vrndph_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndp_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndp_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndph_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndp_f16 + #define vrndp_f16(a) simde_vrndp_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndp_f32(simde_float32x2_t a) { @@ -79,6 +119,30 @@ simde_vrndp_f64(simde_float64x1_t a) { #define vrndp_f64(a) simde_vrndp_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndpq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndpq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndph_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndpq_f16 + #define vrndpq_f16(a) simde_vrndpq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndpq_f32(simde_float32x4_t a) { diff --git a/arm/neon/rndx.h b/arm/neon/rndx.h new file mode 100644 index 000000000..d12e19850 --- /dev/null +++ b/arm/neon/rndx.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
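Taken together, the rounding headers touched here map the NEON flavours onto the corresponding scalar libm operations (vrnd to trunc, vrnda to round, vrndi to nearbyint, vrndm to floor, vrndn to roundeven, vrndp to ceil, vrndx to rint). A small standalone demo of how they differ on a tie value; it assumes a C23 or recent glibc libm for roundevenf, which SIMDe itself reaches through simde_math_roundevenf:

#include <math.h>
#include <stdio.h>

int main(void) {
  const float x = 2.5f;   /* a tie: halfway between 2 and 3 */
  printf("trunc     (vrnd)  : %.1f\n", truncf(x));     /* 2.0 */
  printf("round     (vrnda) : %.1f\n", roundf(x));     /* 3.0, ties away from zero */
  printf("nearbyint (vrndi) : %.1f\n", nearbyintf(x)); /* 2.0 in the default mode */
  printf("floor     (vrndm) : %.1f\n", floorf(x));     /* 2.0 */
  printf("roundeven (vrndn) : %.1f\n", roundevenf(x)); /* 2.0, ties to even */
  printf("ceil      (vrndp) : %.1f\n", ceilf(x));      /* 3.0 */
  printf("rint      (vrndx) : %.1f\n", rintf(x));      /* 2.0 in the default mode */
  return 0;
}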
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RNDX_H) +#define SIMDE_ARM_NEON_RNDX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndxh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndxh_f16(a); + #else + return simde_float16_from_float32(simde_math_rintf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndxh_f16 + #define vrndxh_f16(a) simde_vrndxh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndx_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndx_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndxh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndx_f16 + #define vrndx_f16(a) simde_vrndx_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrndx_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrndx_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rintf(a_.values[i]); + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndx_f32 + #define vrndx_f32(a) simde_vrndx_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrndx_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrndx_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rint(a_.values[i]); + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndx_f64 + #define vrndx_f64(a) simde_vrndx_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndxq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndxq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndxh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrndxq_f16 + #define vrndxq_f16(a) simde_vrndxq_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrndxq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrndxq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rintf(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndxq_f32 + #define vrndxq_f32(a) simde_vrndxq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrndxq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrndxq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rint(a_.values[i]); + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndxq_f64 + #define vrndxq_f64(a) simde_vrndxq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RNDX_H) */ diff --git a/arm/neon/rshl.h b/arm/neon/rshl.h index 8ffcfc666..091a9a407 100644 --- a/arm/neon/rshl.h +++ b/arm/neon/rshl.h @@ -27,7 +27,7 @@ #if !defined(SIMDE_ARM_NEON_RSHL_H) #define SIMDE_ARM_NEON_RSHL_H - +#include "../../x86/avx.h" #include "types.h" /* Notes from the implementer (Christopher Moore aka rosbif) @@ -84,7 +84,9 @@ simde_vrshld_s64(int64_t a, int64_t b) { ? 0 : (b >= 0) ? (a << b) - : ((a + (INT64_C(1) << (-b - 1))) >> -b); + : (a <= 0 + ? ((a + (INT64_C(1) << (-b - 1))) >> -b) + : HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(uint64_t, (a + (INT64_C(1) << (-b - 1)))) >> -b))); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -96,7 +98,7 @@ SIMDE_FUNCTION_ATTRIBUTES uint64_t simde_vrshld_u64(uint64_t a, int64_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vrshld_u64(a, HEDLEY_STATIC_CAST(uint64_t, b)); + return vrshld_u64(a, HEDLEY_STATIC_CAST(int64_t, b)); #else b = HEDLEY_STATIC_CAST(int8_t, b); return @@ -141,14 +143,16 @@ simde_vrshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { _mm256_srai_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (simde_math_abs(b_.values[i]) >= 8) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FUL))))); } #endif @@ -189,7 +193,9 @@ simde_vrshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (simde_math_abs(b_.values[i]) >= 16) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? 
((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFUL))))); } #endif @@ -230,7 +236,9 @@ simde_vrshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (simde_math_abs(b_.values[i]) >= 32) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFFFFFUL))))); } #endif @@ -322,7 +330,7 @@ simde_vrshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { _mm256_srli_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -513,7 +521,9 @@ simde_vrshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (simde_math_abs(b_.values[i]) >= 8) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FUL))))); } #endif @@ -572,7 +582,7 @@ simde_vrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { _mm256_srai_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -580,7 +590,9 @@ simde_vrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (simde_math_abs(b_.values[i]) >= 16) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFUL))))); } #endif @@ -634,8 +646,10 @@ simde_vrshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + ((a_.values[i] <= 0) ? 
((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0X7FFFFFFFUL))))); } #endif @@ -811,7 +825,7 @@ simde_vrshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { _mm256_srli_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/arm/neon/rshr_n.h b/arm/neon/rshr_n.h index 1eb0c11c0..bb3de79ba 100644 --- a/arm/neon/rshr_n.h +++ b/arm/neon/rshr_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RSHR_N_H) @@ -41,6 +42,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_x_vrshrh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return (a >> ((n == 16) ? 15 : n)) + ((a & HEDLEY_STATIC_CAST(int16_t, UINT16_C(1) << (n - 1))) != 0); +} + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_x_vrshrh_n_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return ((n == 16) ? 0 : (a >> n)) + ((a & (UINT32_C(1) << (n - 1))) != 0); +} + SIMDE_FUNCTION_ATTRIBUTES int32_t simde_x_vrshrs_n_s32(int32_t a, const int n) @@ -129,7 +144,7 @@ simde_vrshrq_n_s16 (const simde_int16x8_t a, const int n) #define simde_vrshrq_n_s16(a, n) vrshrq_n_s16((a), (n)) #elif SIMDE_NATURAL_VECTOR_SIZE > 0 #define simde_vrshrq_n_s16(a, n) simde_vsubq_s16(simde_vshrq_n_s16((a), (n)), simde_vreinterpretq_s16_u16( \ - simde_vtstq_u16(simde_vreinterpretq_u16_s16(a), \ + simde_vtstq_u16(simde_vreinterpretq_u16_s16(a), \ simde_vdupq_n_u16(HEDLEY_STATIC_CAST(uint16_t, 1 << ((n) - 1)))))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -314,7 +329,7 @@ simde_vrshr_n_s8 (const simde_int8x8_t a, const int n) #define simde_vrshr_n_s8(a, n) vrshr_n_s8((a), (n)) #elif SIMDE_NATURAL_VECTOR_SIZE > 0 #define simde_vrshr_n_s8(a, n) simde_vsub_s8(simde_vshr_n_s8((a), (n)), simde_vreinterpret_s8_u8( \ - simde_vtst_u8(simde_vreinterpret_u8_s8(a), \ + simde_vtst_u8(simde_vreinterpret_u8_s8(a), \ simde_vdup_n_u8(HEDLEY_STATIC_CAST(uint8_t, 1 << ((n) - 1)))))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/arm/neon/rshrn_high_n.h b/arm/neon/rshrn_high_n.h new file mode 100644 index 000000000..7897581a5 --- /dev/null +++ b/arm/neon/rshrn_high_n.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RSHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_RSHRN_HIGH_N_H + +#include "rshrn_n.h" +#include "combine.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_s16(r, a, n) vrshrn_high_n_s16((r), (a), (n)) +#else + #define simde_vrshrn_high_n_s16(r, a, n) simde_vcombine_s8(r, simde_vrshrn_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_s16 + #define vrshrn_high_n_s16(r, a, n) simde_vrshrn_high_n_s16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_s32(r, a, n) vrshrn_high_n_s32((r), (a), (n)) +#else + #define simde_vrshrn_high_n_s32(r, a, n) simde_vcombine_s16(r, simde_vrshrn_n_s32(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_s32 + #define vrshrn_high_n_s32(r, a, n) simde_vrshrn_high_n_s32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_s64(r, a, n) vrshrn_high_n_s64((r), (a), (n)) +#else + #define simde_vrshrn_high_n_s64(r, a, n) simde_vcombine_s32(r, simde_vrshrn_n_s64(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_s64 + #define vrshrn_high_n_s64(r, a, n) simde_vrshrn_high_n_s64((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_u16(r, a, n) vrshrn_high_n_u16((r), (a), (n)) +#else + #define simde_vrshrn_high_n_u16(r, a, n) simde_vcombine_u8(r, simde_vrshrn_n_u16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_u16 + #define vrshrn_high_n_u16(r, a, n) simde_vrshrn_high_n_u16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_u32(r, a, n) vrshrn_high_n_u32((r), (a), (n)) +#else + #define simde_vrshrn_high_n_u32(r, a, n) simde_vcombine_u16(r, simde_vrshrn_n_u32(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_u32 + #define vrshrn_high_n_u32(r, a, n) simde_vrshrn_high_n_u32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_u64(r, a, n) vrshrn_high_n_u64((r), (a), (n)) +#else + #define simde_vrshrn_high_n_u64(r, a, n) simde_vcombine_u32(r, simde_vrshrn_n_u64(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_u64 + #define vrshrn_high_n_u64(r, a, n) simde_vrshrn_high_n_u64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSHRN_HIGH_N_H) */ diff --git a/arm/neon/rsqrte.h b/arm/neon/rsqrte.h index 8b2adbe2a..7899fd2db 100644 --- a/arm/neon/rsqrte.h +++ b/arm/neon/rsqrte.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by 
Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RSQRTE_H) @@ -34,6 +35,28 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrsqrteh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrteh_f16(a); + #else + #if defined(simde_math_sqrtf) + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + r_ = 1.0f / simde_math_sqrtf(a_); + return simde_float16_from_float32(r_); + #else + HEDLEY_UNREACHABLE(); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrsqrteh_f16 + #define vrsqrteh_f16(a) simde_vrsqrteh_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrsqrtes_f32(simde_float32_t a) { @@ -119,11 +142,11 @@ simde_vrsqrte_u32(simde_uint32x2_t a) { r_; for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[i])) ; i++) { - if(a_.values[i] < 0x3FFFFFFF) { + if (a_.values[i] < 0x3FFFFFFF) { r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; - if(a_temp < 256) { + if (a_temp < 256) { a_temp = a_temp * 2 + 1; } else { a_temp = (a_temp >> 1) << 1; @@ -144,6 +167,34 @@ simde_vrsqrte_u32(simde_uint32x2_t a) { #define vrsqrte_u32(a) simde_vrsqrte_u32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrsqrte_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrte_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + #if defined(simde_math_sqrtf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrsqrteh_f16(a_.values[i]); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrsqrte_f16 + #define vrsqrte_f16(a) simde_vrsqrte_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrsqrte_f32(simde_float32x2_t a) { @@ -254,11 +305,11 @@ simde_vrsqrteq_u32(simde_uint32x4_t a) { r_; for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[i])) ; i++) { - if(a_.values[i] < 0x3FFFFFFF) { + if (a_.values[i] < 0x3FFFFFFF) { r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; - if(a_temp < 256) { + if (a_temp < 256) { a_temp = a_temp * 2 + 1; } else { a_temp = (a_temp >> 1) << 1; @@ -279,6 +330,34 @@ simde_vrsqrteq_u32(simde_uint32x4_t a) { #define vrsqrteq_u32(a) simde_vrsqrteq_u32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrsqrteq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrteq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + #if defined(simde_math_sqrtf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrsqrteh_f16(a_.values[i]); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrsqrteq_f16 + #define vrsqrteq_f16(a) simde_vrsqrteq_f16((a)) +#endif + 
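/* A minimal usage sketch: vrsqrte* only produces an initial estimate of
 * 1/sqrt(x); callers normally refine it with one or more Newton-Raphson
 * steps, and the (3 - a*b)/2 step used for that is what vrsqrts (rsqrts.h,
 * below) provides.  The helper name below is hypothetical, not a NEON
 * intrinsic, and the sketch assumes only the scalar names already defined
 * in this header. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_x_vrsqrtes_refined_f32(simde_float32_t x) {
  simde_float32_t estimate = simde_vrsqrtes_f32(x);           /* initial estimate of 1/sqrt(x) */
  return estimate * (3.0f - x * estimate * estimate) * 0.5f;  /* one Newton-Raphson refinement step */
}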
SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrsqrteq_f32(simde_float32x4_t a) { diff --git a/arm/neon/rsqrts.h b/arm/neon/rsqrts.h index 3c7f720bb..612a597a1 100644 --- a/arm/neon/rsqrts.h +++ b/arm/neon/rsqrts.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RSQRTS_H) @@ -37,6 +38,27 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrsqrtsh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrtsh_f16(a, b); + #else + return + simde_vmulh_f16( + simde_vsubh_f16( + SIMDE_FLOAT16_VALUE(3.0), + simde_vmulh_f16(a, b)), + SIMDE_FLOAT16_VALUE(0.5) + ); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrsqrtsh_f16 + #define vrsqrtsh_f16(a, b) simde_vrsqrtsh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrsqrtss_f32(simde_float32_t a, simde_float32_t b) { @@ -65,6 +87,27 @@ simde_vrsqrtsd_f64(simde_float64_t a, simde_float64_t b) { #define vrsqrtsd_f64(a, b) simde_vrsqrtsd_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrsqrts_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrts_f16(a, b); + #else + return + simde_vmul_n_f16( + simde_vsub_f16( + simde_vdup_n_f16(SIMDE_FLOAT16_VALUE(3.0)), + simde_vmul_f16(a, b)), + SIMDE_FLOAT16_VALUE(0.5) + ); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrsqrts_f16 + #define vrsqrts_f16(a, b) simde_vrsqrts_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrsqrts_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -107,6 +150,27 @@ simde_vrsqrts_f64(simde_float64x1_t a, simde_float64x1_t b) { #define vrsqrts_f64(a, b) simde_vrsqrts_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrsqrtsq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrtsq_f16(a, b); + #else + return + simde_vmulq_n_f16( + simde_vsubq_f16( + simde_vdupq_n_f16(SIMDE_FLOAT16_VALUE(3.0)), + simde_vmulq_f16(a, b)), + SIMDE_FLOAT16_VALUE(0.5) + ); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vrsqrtsq_f16 + #define vrsqrtsq_f16(a, b) simde_vrsqrtsq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrsqrtsq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/arm/neon/rsubhn.h b/arm/neon/rsubhn.h new file mode 100644 index 000000000..5d195f83c --- /dev/null +++ b/arm/neon/rsubhn.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject 
to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_RSUBHN_H) +#define SIMDE_ARM_NEON_RSUBHN_H + +#include "sub.h" +#include "shr_n.h" +#include "movn.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vrsubhn_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_s16(a, b); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + int16_t round_cast = 1 << 7; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_i16m1(__riscv_vsub_vv_i16m1(a_.sv128, b_.sv128, 8), round_cast, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, a_.values[i] - b_.values[i] + round_cast); + } + #endif + return simde_vmovn_s16(simde_vshrq_n_s16(simde_int16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_s16 + #define vrsubhn_s16(a, b) simde_vrsubhn_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vrsubhn_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_s32(a, b); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + int round_cast = 1 << 15; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_i32m1(__riscv_vsub_vv_i32m1(a_.sv128, b_.sv128, 4), round_cast, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] - b_.values[i] + round_cast; + } + #endif + return simde_vmovn_s32(simde_vshrq_n_s32(simde_int32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_s32 + #define vrsubhn_s32(a, b) simde_vrsubhn_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vrsubhn_s64(simde_int64x2_t a, simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_s64(a, b); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_i64m1(__riscv_vsub_vv_i64m1(a_.sv128, b_.sv128, 2), 0x80000000, 2); + return simde_vmovn_s64(simde_vshrq_n_s64(simde_int64x2_from_private(r_), 32)); + #else + int64_t round_cast = 1ll << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); + } + return 
simde_vmovn_s64(simde_int64x2_from_private(r_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_s64 + #define vrsubhn_s64(a, b) simde_vrsubhn_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vrsubhn_u16(simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_u16(a, b); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + uint16_t round_cast = 1 << 7; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_u16m1(__riscv_vsub_vv_u16m1(a_.sv128, b_.sv128, 8), round_cast, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, a_.values[i] - b_.values[i] + round_cast); + } + #endif + return simde_vmovn_u16(simde_vshrq_n_u16(simde_uint16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_u16 + #define vrsubhn_u16(a, b) simde_vrsubhn_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vrsubhn_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_u32(a, b); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t round_cast = 1 << 15; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_u32m1(__riscv_vsub_vv_u32m1(a_.sv128, b_.sv128, 4), round_cast, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] - b_.values[i] + round_cast; + } + #endif + return simde_vmovn_u32(simde_vshrq_n_u32(simde_uint32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_u32 + #define vrsubhn_u32(a, b) simde_vrsubhn_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vrsubhn_u64(simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_u64m1(__riscv_vsub_vv_u64m1(a_.sv128, b_.sv128, 2), 0x80000000, 2); + return simde_vmovn_u64(simde_vshrq_n_u64(simde_uint64x2_from_private(r_), 32)); + #else + uint64_t round_cast = 1ull << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_u64(simde_uint64x2_from_private(r_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_u64 + #define vrsubhn_u64(a, b) simde_vrsubhn_u64((a), (b)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSUBHN_H) */ diff --git a/arm/neon/rsubhn_high.h b/arm/neon/rsubhn_high.h new file mode 100644 index 000000000..d7b19849e --- /dev/null +++ b/arm/neon/rsubhn_high.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell 
copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RSUBHN_HIGH_H) +#define SIMDE_ARM_NEON_RSUBHN_HIGH_H + +#include "rsubhn.h" +#include "combine.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_s16(r, a, b) vrsubhn_high_s16((r), (a), (b)) +#else + #define simde_vrsubhn_high_s16(r, a, b) simde_vcombine_s8(r, simde_vrsubhn_s16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_s16 + #define vrsubhn_high_s16(r, a, b) simde_vrsubhn_high_s16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_s32(r, a, b) vrsubhn_high_s32((r), (a), (b)) +#else + #define simde_vrsubhn_high_s32(r, a, b) simde_vcombine_s16(r, simde_vrsubhn_s32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_s32 + #define vrsubhn_high_s32(r, a, b) simde_vrsubhn_high_s32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_s64(r, a, b) vrsubhn_high_s64((r), (a), (b)) +#else + #define simde_vrsubhn_high_s64(r, a, b) simde_vcombine_s32(r, simde_vrsubhn_s64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_s64 + #define vrsubhn_high_s64(r, a, b) simde_vrsubhn_high_s64((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_u16(r, a, b) vrsubhn_high_u16((r), (a), (b)) +#else + #define simde_vrsubhn_high_u16(r, a, b) simde_vcombine_u8(r, simde_vrsubhn_u16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_u16 + #define vrsubhn_high_u16(r, a, b) simde_vrsubhn_high_u16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_u32(r, a, b) vrsubhn_high_u32((r), (a), (b)) +#else + #define simde_vrsubhn_high_u32(r, a, b) simde_vcombine_u16(r, simde_vrsubhn_u32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_u32 + #define vrsubhn_high_u32(r, a, b) simde_vrsubhn_high_u32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_u64(r, a, b) vrsubhn_high_u64((r), (a), (b)) +#else + #define simde_vrsubhn_high_u64(r, a, b) simde_vcombine_u32(r, simde_vrsubhn_u64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_u64 + #define vrsubhn_high_u64(r, a, b) simde_vrsubhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSUBHN_HIGH_H) */ diff --git a/arm/neon/set_lane.h 
b/arm/neon/set_lane.h index 70291143a..57813d31b 100644 --- a/arm/neon/set_lane.h +++ b/arm/neon/set_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SET_LANE_H) @@ -33,6 +34,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vset_lane_f16(simde_float16_t a, simde_float16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_t r; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_(vset_lane_f16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_float16x4_private v_ = simde_float16x4_to_private(v); + v_.values[lane] = a; + r = simde_float16x4_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vset_lane_f16 + #define vset_lane_f16(a, b, c) simde_vset_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vset_lane_f32(simde_float32_t a, simde_float32x2_t v, const int lane) @@ -226,6 +247,26 @@ simde_vset_lane_u64(uint64_t a, simde_uint64x1_t v, const int lane) #define vset_lane_u64(a, b, c) simde_vset_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vsetq_lane_f16(simde_float16_t a, simde_float16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_t r; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_(vsetq_lane_f16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_float16x8_private v_ = simde_float16x8_to_private(v); + v_.values[lane] = a; + r = simde_float16x8_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) + #undef vsetq_lane_f16 + #define vsetq_lane_f16(a, b, c) simde_vsetq_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vsetq_lane_f32(simde_float32_t a, simde_float32x4_t v, const int lane) @@ -416,6 +457,160 @@ simde_vsetq_lane_u64(uint64_t a, simde_uint64x2_t v, const int lane) #define vsetq_lane_u64(a, b, c) simde_vsetq_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vset_lane_p8(simde_poly8_t a, simde_poly8x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8_t r; + simde_poly8x8_private v_ = simde_poly8x8_to_private(v); + v_.values[lane] = a; + r = simde_poly8x8_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vset_lane_p8(a, b, c) vset_lane_p8((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vset_lane_p8 + #define vset_lane_p8(a, b, c) simde_vset_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vset_lane_p16(simde_poly16_t a, simde_poly16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4_t r; + simde_poly16x4_private v_ = simde_poly16x4_to_private(v); + v_.values[lane] = a; + r = simde_poly16x4_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && 
!defined(SIMDE_BUG_CLANG_71362) + #define simde_vset_lane_p16(a, b, c) vset_lane_p16((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vset_lane_p16 + #define vset_lane_p16(a, b, c) simde_vset_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vset_lane_p64(simde_poly64_t a, simde_poly64x1_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1_t r; + simde_poly64x1_private v_ = simde_poly64x1_to_private(v); + v_.values[lane] = a; + r = simde_poly64x1_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vset_lane_p64(a, b, c) vset_lane_p64((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vset_lane_p64 + #define vset_lane_p64(a, b, c) simde_vset_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vsetq_lane_p8(simde_poly8_t a, simde_poly8x16_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16_t r; + simde_poly8x16_private v_ = simde_poly8x16_to_private(v); + v_.values[lane] = a; + r = simde_poly8x16_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vsetq_lane_p8(a, b, c) vsetq_lane_p8((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vsetq_lane_p8 + #define vsetq_lane_p8(a, b, c) simde_vsetq_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vsetq_lane_p16(simde_poly16_t a, simde_poly16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8_t r; + simde_poly16x8_private v_ = simde_poly16x8_to_private(v); + v_.values[lane] = a; + r = simde_poly16x8_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vsetq_lane_p16(a, b, c) vsetq_lane_p16((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vsetq_lane_p16 + #define vsetq_lane_p16(a, b, c) simde_vsetq_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vsetq_lane_p64(simde_poly64_t a, simde_poly64x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2_t r; + simde_poly64x2_private v_ = simde_poly64x2_to_private(v); + v_.values[lane] = a; + r = simde_poly64x2_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vsetq_lane_p64(a, b, c) vsetq_lane_p64((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) + #undef vsetq_lane_p64 + #define vsetq_lane_p64(a, b, c) simde_vsetq_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vset_lane_bf16(simde_bfloat16_t a, simde_bfloat16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4_t r; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vset_lane_bf16, r, (HEDLEY_UNREACHABLE(), v), 
lane, a, v); + #else + simde_bfloat16x4_private v_ = simde_bfloat16x4_to_private(v); + v_.values[lane] = a; + r = simde_bfloat16x4_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vset_lane_bf16 + #define vset_lane_bf16(a, b, c) simde_vset_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vsetq_lane_bf16(simde_bfloat16_t a, simde_bfloat16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8_t r; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_(vsetq_lane_bf16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_bfloat16x8_private v_ = simde_bfloat16x8_to_private(v); + v_.values[lane] = a; + r = simde_bfloat16x8_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vsetq_lane_bf16 + #define vsetq_lane_bf16(a, b, c) simde_vsetq_lane_bf16((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/sha1.h b/arm/neon/sha1.h new file mode 100644 index 000000000..d1f680390 --- /dev/null +++ b/arm/neon/sha1.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHA1_H) +#define SIMDE_ARM_NEON_SHA1_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define ROL(operand, N, shift) (((operand) >> (N-shift)) | ((operand) << (shift))) + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vsha1h_u32(uint32_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1h_u32(a); + #else + return ROL(a, 32, 30); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha1h_u32 + #define vsha1h_u32(a) simde_vsha1h_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1cq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1cq_u32(hash_abcd, hash_e, wk); + #else + simde_uint32x4_private + x_ = simde_uint32x4_to_private(hash_abcd), + w_ = simde_uint32x4_to_private(wk); + uint32_t y_ = hash_e; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + uint32_t t = (((x_.values[2] ^ x_.values[3]) & x_.values[1]) ^ x_.values[3]); + y_ = y_ + ROL(x_.values[0], 32, 5) + t + w_.values[i]; + x_.values[1] = ROL(x_.values[1], 32, 30); + uint32_t tmp = y_; + y_ = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return simde_uint32x4_from_private(x_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha1cq_u32 + #define vsha1cq_u32(hash_abcd, hash_e, wk) simde_vsha1cq_u32((hash_abcd), (hash_e), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1mq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1mq_u32(hash_abcd, hash_e, wk); + #else + simde_uint32x4_private + x_ = simde_uint32x4_to_private(hash_abcd), + w_ = simde_uint32x4_to_private(wk); + uint32_t y_ = hash_e; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + uint32_t t = ((x_.values[1] & x_.values[2]) | ((x_.values[1] | x_.values[2]) & x_.values[3])); + y_ = y_ + ROL(x_.values[0], 32, 5) + t + w_.values[i]; + x_.values[1] = ROL(x_.values[1], 32, 30); + uint32_t tmp = y_; + y_ = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return simde_uint32x4_from_private(x_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha1mq_u32 + #define vsha1mq_u32(hash_abcd, hash_e, wk) simde_vsha1mq_u32((hash_abcd), (hash_e), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1pq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1pq_u32(hash_abcd, hash_e, wk); + #else + simde_uint32x4_private + x_ = simde_uint32x4_to_private(hash_abcd), + w_ = simde_uint32x4_to_private(wk); + uint32_t y_ = hash_e; + SIMDE_VECTORIZE + for 
(size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + uint32_t t = (x_.values[1] ^ x_.values[2] ^ x_.values[3]); + y_ = y_ + ROL(x_.values[0], 32, 5) + t + w_.values[i]; + x_.values[1] = ROL(x_.values[1], 32, 30); + uint32_t tmp = y_; + y_ = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return simde_uint32x4_from_private(x_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha1pq_u32 + #define vsha1pq_u32(hash_abcd, hash_e, wk) simde_vsha1pq_u32((hash_abcd), (hash_e), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1su0q_u32(simde_uint32x4_t w0_3, simde_uint32x4_t w4_7, simde_uint32x4_t w8_11) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1su0q_u32(w0_3, w4_7, w8_11); + #else + simde_uint32x4_private + r_, + x_ = simde_uint32x4_to_private(w0_3), + y_ = simde_uint32x4_to_private(w4_7), + z_ = simde_uint32x4_to_private(w8_11); + r_.values[3] = y_.values[1]; + r_.values[2] = y_.values[0]; + r_.values[1] = x_.values[3]; + r_.values[0] = x_.values[2]; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + r_.values[i] = r_.values[i] ^ x_.values[i] ^ z_.values[i]; + } + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha1su0q_u32 + #define vsha1su0q_u32(w0_3, w4_7, w8_11) simde_vsha1su0q_u32((w0_3), (w4_7), (w8_11)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1su1q_u32(simde_uint32x4_t tw0_3, simde_uint32x4_t tw12_15) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1su1q_u32(tw0_3, tw12_15); + #else + simde_uint32x4_private + r_, + T_, + x_ = simde_uint32x4_to_private(tw0_3), + y_ = simde_uint32x4_to_private(tw12_15); + T_.values[0] = x_.values[0] ^ y_.values[1]; + T_.values[1] = x_.values[1] ^ y_.values[2]; + T_.values[2] = x_.values[2] ^ y_.values[3]; + T_.values[3] = x_.values[3] ^ 0x0; + r_.values[0] = ROL(T_.values[0], 32, 1); + r_.values[1] = ROL(T_.values[1], 32, 1); + r_.values[2] = ROL(T_.values[2], 32, 1); + r_.values[3] = ROL(T_.values[3], 32, 1) ^ ROL(T_.values[0], 32, 2); + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha1su1q_u32 + #define vsha1su1q_u32(tw0_3, tw12_15) simde_vsha1su1q_u32((tw0_3), (tw12_15)) +#endif + +#undef ROL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHA1_H) */ diff --git a/arm/neon/sha256.h b/arm/neon/sha256.h new file mode 100644 index 000000000..38fe3b4e0 --- /dev/null +++ b/arm/neon/sha256.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * 
The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHA256_H) +#define SIMDE_ARM_NEON_SHA256_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define SIMDE_ROR32(operand, shift) (((operand) >> (shift)) | ((operand) << (32-shift))) +#define SIMDE_ROL32(operand, shift) (((operand) >> (32-shift)) | ((operand) << (shift))) +#define SIMDE_LSR(operand, shift) ((operand) >> (shift)) +#define SIMDE_LSL(operand, shift) ((operand) << (shift)) + +static uint32_t simde_SHAchoose(uint32_t x, uint32_t y, uint32_t z) { + return (((y ^ z) & x) ^ z); +} + +static uint32_t simde_SHAmajority(uint32_t x, uint32_t y, uint32_t z) { + return ((x & y) | ((x | y) & z)); +} + +static uint32_t simde_SHAhashSIGMA0(uint32_t x) { + return SIMDE_ROR32(x, 2) ^ SIMDE_ROR32(x, 13) ^ SIMDE_ROR32(x, 22); +} + +static uint32_t simde_SHAhashSIGMA1(uint32_t x) { + return SIMDE_ROR32(x, 6) ^ SIMDE_ROR32(x, 11) ^ SIMDE_ROR32(x, 25); +} + +static simde_uint32x4_t +x_simde_sha256hash(simde_uint32x4_t x, simde_uint32x4_t y, simde_uint32x4_t w, int part1) { + uint32_t chs, maj, t; + simde_uint32x4_private + x_ = simde_uint32x4_to_private(x), + y_ = simde_uint32x4_to_private(y), + w_ = simde_uint32x4_to_private(w); + + for(int i = 0; i < 4; ++i) { + chs = simde_SHAchoose(y_.values[0], y_.values[1], y_.values[2]); + maj = simde_SHAmajority(x_.values[0], x_.values[1], x_.values[2]); + t = y_.values[3] + simde_SHAhashSIGMA1(y_.values[0]) + chs + w_.values[i]; + x_.values[3] = t + x_.values[3]; + y_.values[3] = t + simde_SHAhashSIGMA0(x_.values[0]) + maj; + uint32_t tmp = y_.values[3]; + y_.values[3] = 0x0 | y_.values[2]; + y_.values[2] = 0x0 | y_.values[1]; + y_.values[1] = 0x0 | y_.values[0]; + y_.values[0] = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return (part1 == 1) ? 
simde_uint32x4_from_private(x_) : simde_uint32x4_from_private(y_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256hq_u32(simde_uint32x4_t hash_efgh, simde_uint32x4_t hash_abcd, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256hq_u32(hash_efgh, hash_abcd, wk); + #else + return x_simde_sha256hash(hash_efgh, hash_abcd, wk, 1); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha256hq_u32 + #define vsha256hq_u32(hash_efgh, hash_abcd, wk) simde_vsha256hq_u32((hash_efgh), (hash_abcd), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256h2q_u32(simde_uint32x4_t hash_efgh, simde_uint32x4_t hash_abcd, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256h2q_u32(hash_efgh, hash_abcd, wk); + #else + return x_simde_sha256hash(hash_abcd, hash_efgh, wk, 0); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha256h2q_u32 + #define vsha256h2q_u32(hash_efgh, hash_abcd, wk) simde_vsha256h2q_u32((hash_efgh), (hash_abcd), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256su0q_u32(simde_uint32x4_t w0_3, simde_uint32x4_t w4_7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256su0q_u32(w0_3, w4_7); + #else + simde_uint32x4_private + r_, + T_, + x_ = simde_uint32x4_to_private(w0_3), + y_ = simde_uint32x4_to_private(w4_7); + T_.values[3] = y_.values[0]; + T_.values[2] = x_.values[3]; + T_.values[1] = x_.values[2]; + T_.values[0] = x_.values[1]; + uint32_t elt; + for(int i = 0; i < 4; ++i) { + elt = T_.values[i]; + elt = SIMDE_ROR32(elt, 7) ^ SIMDE_ROR32(elt, 18) ^ SIMDE_LSR(elt, 3); + r_.values[i] = elt + x_.values[i]; + } + return simde_uint32x4_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha256su0q_u32 + #define vsha256su0q_u32(w0_3, w4_7) simde_vsha256su0q_u32((w0_3), (w4_7)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256su1q_u32(simde_uint32x4_t tw0_3, simde_uint32x4_t w8_11, simde_uint32x4_t w12_15) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256su1q_u32(tw0_3, w8_11, w12_15); + #else + simde_uint32x4_private + r_, + T0_, + x_ = simde_uint32x4_to_private(tw0_3), + y_ = simde_uint32x4_to_private(w8_11), + z_ = simde_uint32x4_to_private(w12_15); + simde_uint32x2_private T1_; + T0_.values[3] = z_.values[0]; + T0_.values[2] = y_.values[3]; + T0_.values[1] = y_.values[2]; + T0_.values[0] = y_.values[1]; + uint32_t elt; + T1_.values[1] = z_.values[3]; + T1_.values[0] = z_.values[2]; + for(int i = 0; i < 2; ++i) { + elt = T1_.values[i]; + elt = SIMDE_ROR32(elt, 17) ^ SIMDE_ROR32(elt, 19) ^ SIMDE_LSR(elt, 10); + elt = elt + x_.values[i] + T0_.values[i]; + r_.values[i] = elt; + } + T1_.values[1] = r_.values[1]; + T1_.values[0] = r_.values[0]; + for(int i = 2; i < 4; ++i) { + elt = T1_.values[i-2]; + elt = SIMDE_ROR32(elt, 17) ^ SIMDE_ROR32(elt, 19) ^ SIMDE_LSR(elt, 10); + elt = elt + x_.values[i] + T0_.values[i]; + r_.values[i] = elt; + } + return simde_uint32x4_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) + #undef vsha256su1q_u32 + #define vsha256su1q_u32(tw0_3, w8_11, w12_15) simde_vsha256su1q_u32((tw0_3), (w8_11), (w12_15)) +#endif + +#undef SIMDE_ROR32 +#undef SIMDE_ROL32 +#undef SIMDE_LSR +#undef SIMDE_LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHA256_H) */ diff --git a/arm/neon/sha512.h b/arm/neon/sha512.h new file mode 100644 index 000000000..734cf34fe --- /dev/null +++ b/arm/neon/sha512.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHA512_H) +#define SIMDE_ARM_NEON_SHA512_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define SIMDE_ROR64(operand, shift) (((operand) >> (shift)) | ((operand) << (64-shift))) +#define SIMDE_ROL64(operand, shift) (((operand) >> (64-shift)) | ((operand) << (shift))) +#define SIMDE_LSR(operand, shift) ((operand) >> (shift)) +#define SIMDE_LSL(operand, shift) ((operand) << (shift)) + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512hq_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512hq_u64(w, x, y); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x), + y_ = simde_uint64x2_to_private(y); + uint64_t Msigma1; + uint64_t tmp; + Msigma1 = SIMDE_ROR64(y_.values[1], 14) ^ SIMDE_ROR64(y_.values[1], 18) ^ SIMDE_ROR64(y_.values[1], 41); + r_.values[1] = (y_.values[1] & x_.values[0]) ^ (~(y_.values[1]) & x_.values[1]); + r_.values[1] = (r_.values[1] + Msigma1 + w_.values[1]); + tmp = r_.values[1] + y_.values[0]; + Msigma1 = SIMDE_ROR64(tmp, 14) ^ SIMDE_ROR64(tmp, 18) ^ SIMDE_ROR64(tmp, 41); + r_.values[0] = (tmp & y_.values[1]) ^ (~(tmp) & x_.values[0]); + r_.values[0] = (r_.values[0] + Msigma1 + w_.values[0]); + return simde_uint64x2_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) + #undef vsha512hq_u64 + #define vsha512hq_u64(w, x, y) simde_vsha512hq_u64((w), (x), (y)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512h2q_u64(simde_uint64x2_t w, simde_uint64x2_t x, 
simde_uint64x2_t y) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512h2q_u64(w, x, y); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x), + y_ = simde_uint64x2_to_private(y); + uint64_t Msigma0; + Msigma0 = SIMDE_ROR64(y_.values[0], 28) ^ SIMDE_ROR64(y_.values[0], 34) ^ SIMDE_ROR64(y_.values[0], 39); + r_.values[1] = (y_.values[1] & x_.values[0]) ^ (y_.values[0] & x_.values[0]) ^ (y_.values[1] & y_.values[0]); + r_.values[1] = (r_.values[1] + Msigma0 + w_.values[1]); + Msigma0 = SIMDE_ROR64(r_.values[1], 28) ^ SIMDE_ROR64(r_.values[1], 34) ^ SIMDE_ROR64(r_.values[1], 39); + r_.values[0] = (r_.values[1] & y_.values[0]) ^ (r_.values[1] & y_.values[1]) ^ (y_.values[1] & y_.values[0]); + r_.values[0] = (r_.values[0] + Msigma0 + w_.values[0]); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) + #undef vsha512h2q_u64 + #define vsha512h2q_u64(w, x, y) simde_vsha512h2q_u64((w), (x), (y)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512su0q_u64(simde_uint64x2_t w, simde_uint64x2_t x) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512su0q_u64(w, x); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x); + uint64_t sig0; + sig0 = SIMDE_ROR64(w_.values[1], 1) ^ SIMDE_ROR64(w_.values[1], 8) ^ (w_.values[1] >> 7); + r_.values[0] = w_.values[0] + sig0; + sig0 = SIMDE_ROR64(x_.values[0], 1) ^ SIMDE_ROR64(x_.values[0], 8) ^ (x_.values[0] >> 7); + r_.values[1] = w_.values[1] + sig0; + return simde_uint64x2_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) + #undef vsha512su0q_u64 + #define vsha512su0q_u64(w, x) simde_vsha512su0q_u64((w), (x)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512su1q_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512su1q_u64(w, x, y); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x), + y_ = simde_uint64x2_to_private(y); + uint64_t sig1; + sig1 = SIMDE_ROR64(x_.values[1], 19) ^ SIMDE_ROR64(x_.values[1], 61) ^ (x_.values[1] >> 6); + r_.values[1] = w_.values[1] + sig1 + y_.values[1]; + sig1 = SIMDE_ROR64(x_.values[0], 19) ^ SIMDE_ROR64(x_.values[0], 61) ^ (x_.values[0] >> 6); + r_.values[0] = w_.values[0] + sig1 + y_.values[0]; + return simde_uint64x2_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) + #undef vsha512su1q_u64 + #define vsha512su1q_u64(w, x, y) simde_vsha512su1q_u64((w), (x), (y)) +#endif + +#undef SIMDE_ROR64 +#undef SIMDE_ROL64 +#undef SIMDE_LSR +#undef SIMDE_LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHA512_H) */ diff --git a/arm/neon/shl.h b/arm/neon/shl.h index 3799fbab6..5a250dc9a 100644 --- a/arm/neon/shl.h +++ b/arm/neon/shl.h @@ -23,12 +23,14 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHL_H) #define SIMDE_ARM_NEON_SHL_H 
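The scalar fallbacks in the new sha512.h above are direct transcriptions of the SHA-512 Sigma/sigma functions, so they can be sanity-checked lane by lane against a plain C model. A minimal sketch follows (an editor's illustration, not part of the patch; ror64 and sha512_sigma0 are hypothetical helper names, and a hosted C99 toolchain is assumed):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* sigma0(v) = ror(v,1) ^ ror(v,8) ^ (v >> 7), the message-schedule
 * function used per lane by simde_vsha512su0q_u64 above. */
static uint64_t ror64(uint64_t v, unsigned s) {
  /* valid for 1 <= s <= 63, the only shift amounts the SHA-512 helpers use */
  return (v >> s) | (v << (64 - s));
}

static uint64_t sha512_sigma0(uint64_t v) {
  return ror64(v, 1) ^ ror64(v, 8) ^ (v >> 7);
}

int main(void) {
  uint64_t w0 = UINT64_C(0x0123456789abcdef);
  uint64_t w1 = UINT64_C(0xfedcba9876543210);
  /* per the implementation above, lane 0 of vsha512su0q_u64(w, x) is w[0] + sigma0(w[1]) */
  printf("%016" PRIx64 "\n", w0 + sha512_sigma0(w1));
  return 0;
}

Comparing this value against lane 0 of simde_vsha512su0q_u64 (extracted via simde_uint64x2_to_private) is one quick way to validate the portable path on targets without the ARMv8.2 SHA-512 extension.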
#include "types.h" +#include "../../x86/avx.h" /* Notes from the implementer (Christopher Moore aka rosbif) * @@ -99,7 +101,7 @@ SIMDE_FUNCTION_ATTRIBUTES uint64_t simde_vshld_u64 (const uint64_t a, const int64_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vshld_u64(a, HEDLEY_STATIC_CAST(uint64_t, b)); + return vshld_u64(a, HEDLEY_STATIC_CAST(int64_t, b)); #else int8_t b_ = HEDLEY_STATIC_CAST(int8_t, b); return @@ -140,15 +142,25 @@ simde_vshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t bit_shift_rst = __riscv_vmerge_vxm_i8m1( + __riscv_vsll_vv_i8m1 (a_.sv64, __riscv_vreinterpret_v_i8m1_u8m1(b_.sv64), 8), 0, __riscv_vmsge_vx_i8m1_b8(b_.sv64, 8, 8), 8); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv64, __riscv_vneg_v_i8m1 (b_.sv64, 8), 8); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + vint8m1_t scal_shift_rst = __riscv_vmerge_vvm_i8m1(__riscv_vsra_vv_i8m1 (a_.sv64, u_b_abs, 8), \ + __riscv_vsra_vx_i8m1(a_.sv64, 7, 8), __riscv_vmsle_vx_i8m1_b8(b_.sv64, -8, 8), 8); + r_.sv64 = __riscv_vmerge_vvm_i8m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8m1_b8 (b_.sv64, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int8x8_from_private(r_); @@ -179,14 +191,25 @@ simde_vshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -16) ? 
(a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv64, 4); + vint16m1_t bit_shift_rst = __riscv_vmerge_vxm_i16m1(__riscv_vsll_vv_i16m1 (a_.sv64, __riscv_vreinterpret_v_i16m1_u16m1(b_.sv64), 4), 0 \ + , __riscv_vmsge_vx_i8mf2_b16(b_8mf2, 16, 8), 4); + vint16m1_t b_abs = __riscv_vmax_vv_i16m1 (b_.sv64, __riscv_vneg_v_i16m1 (b_.sv64, 4), 4); + vuint16m1_t u_b_abs = __riscv_vreinterpret_v_i16m1_u16m1 (b_abs); + vint16m1_t scal_shift_rst = __riscv_vmerge_vvm_i16m1(__riscv_vsra_vv_i16m1 (a_.sv64, u_b_abs, 4) + , __riscv_vsra_vx_i16m1(a_.sv64, 15, 4), __riscv_vmsle_vx_i8mf2_b16(b_8mf2, -16, 8), 4); + r_.sv64 = __riscv_vmerge_vvm_i16m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf2_b16 (b_8mf2, 0, 8), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int16x4_from_private(r_); @@ -217,14 +240,25 @@ simde_vshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = - (b_.values[i] >= 0) ? - (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 (__riscv_vncvt_x_x_w_i16mf2 (b_.sv64, 2), 4); + vint32m1_t bit_shift_rst = __riscv_vmerge_vxm_i32m1(__riscv_vsll_vv_i32m1 (a_.sv64, __riscv_vreinterpret_v_i32m1_u32m1(b_.sv64), 2), 0 + , __riscv_vmsge_vx_i8mf4_b32(b_8mf4, 32, 2), 2); + vint32m1_t b_abs = __riscv_vmax_vv_i32m1 (b_.sv64, __riscv_vneg_v_i32m1 (b_.sv64, 2), 2); + vuint32m1_t u_b_abs = __riscv_vreinterpret_v_i32m1_u32m1 (b_abs); + vint32m1_t scal_shift_rst = __riscv_vmerge_vvm_i32m1(__riscv_vsra_vv_i32m1 (a_.sv64, u_b_abs, 2) + , __riscv_vsra_vx_i32m1(a_.sv64, 31, 2), __riscv_vmsle_vx_i8mf4_b32(b_8mf4, -32, 2), 2); + r_.sv64 = __riscv_vmerge_vvm_i32m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf4_b32 (b_8mf4, 0, 8), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = + (b_.values[i] >= 0) ? + (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -32) ? 
(a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_int32x2_from_private(r_); @@ -266,10 +300,21 @@ simde_vshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) { _mm_cmpgt_epi64(zero, _mm_slli_epi64(b128, 56))); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8 (__riscv_vncvt_x_x_w_i16mf4 (__riscv_vncvt_x_x_w_i32mf2 (b_.sv64, 1), 2), 4); + vint64m1_t bit_shift_rst = __riscv_vmerge_vxm_i64m1(__riscv_vsll_vv_i64m1 (a_.sv64, __riscv_vreinterpret_v_i64m1_u64m1(b_.sv64), 1), 0 + , __riscv_vmsge_vx_i8mf8_b64(b_8mf8, 64, 1), 1); + vint64m1_t b_abs = __riscv_vmax_vv_i64m1 (b_.sv64, __riscv_vneg_v_i64m1 (b_.sv64, 1), 1); + vuint64m1_t u_b_abs = __riscv_vreinterpret_v_i64m1_u64m1 (b_abs); + vint64m1_t scal_shift_rst = __riscv_vmerge_vvm_i64m1(__riscv_vsra_vv_i64m1 (a_.sv64, u_b_abs, 1) + , __riscv_vsra_vx_i64m1(a_.sv64, 63, 1), __riscv_vmsle_vx_i8mf8_b64(b_8mf8, -64, 1), 1); + r_.sv64 = __riscv_vmerge_vvm_i64m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf8_b64 (b_8mf8, 0, 8), 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_int64x1_from_private(r_); @@ -305,15 +350,24 @@ simde_vshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, - (simde_math_abs(b_.values[i]) >= 8) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t u_b = __riscv_vreinterpret_v_i8m1_u8m1 (b_.sv64); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv64, __riscv_vneg_v_i8m1 (b_.sv64, 8), 8); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vsrl_vv_u8m1(a_.sv64, u_b_abs, 8) + , __riscv_vsll_vv_u8m1 (a_.sv64, u_b, 8), __riscv_vmsge_vx_i8m1_b8(b_.sv64, 0, 8), 8), 0 \ + ,__riscv_vmsgeu_vx_u8m1_b8(u_b_abs, 8, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, + (simde_math_abs(b_.values[i]) >= 8) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint8x8_from_private(r_); @@ -344,14 +398,26 @@ simde_vshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, - (simde_math_abs(b_.values[i]) >= 16) ? 
0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv64, 4); + vint8mf2_t b_8mf2_abs = __riscv_vmax_vv_i8mf2 (b_8mf2, __riscv_vneg_v_i8mf2 (b_8mf2, 8), 8); + vuint8mf2_t u_b_8mf2_abs = __riscv_vreinterpret_v_i8mf2_u8mf2 (b_8mf2_abs); + vuint16m1_t u_b = __riscv_vreinterpret_v_i16m1_u16m1 (b_.sv64); + vint16m1_t b_abs = __riscv_vmax_vv_i16m1 (b_.sv64, __riscv_vneg_v_i16m1 (b_.sv64, 4), 4); + vuint16m1_t u_b_abs = __riscv_vreinterpret_v_i16m1_u16m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vsrl_vv_u16m1(a_.sv64, u_b_abs, 4) + , __riscv_vsll_vv_u16m1 (a_.sv64, u_b, 4), __riscv_vmsge_vx_i16m1_b16(b_.sv64, 0, 4), 4) + , 0, __riscv_vmsgeu_vx_u8mf2_b16(u_b_8mf2_abs, 16, 8), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, + (simde_math_abs(b_.values[i]) >= 16) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint16x4_from_private(r_); @@ -382,14 +448,26 @@ simde_vshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = - (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 (__riscv_vncvt_x_x_w_i16mf2 (b_.sv64, 2), 4); + vint8mf4_t b_8mf4_abs = __riscv_vmax_vv_i8mf4 (b_8mf4, __riscv_vneg_v_i8mf4 (b_8mf4, 8), 8); + vuint8mf4_t u_b_8mf4_abs = __riscv_vreinterpret_v_i8mf4_u8mf4 (b_8mf4_abs); + vuint32m1_t u_b = __riscv_vreinterpret_v_i32m1_u32m1 (b_.sv64); + vint32m1_t b_abs = __riscv_vmax_vv_i32m1 (b_.sv64, __riscv_vneg_v_i32m1 (b_.sv64, 2), 2); + vuint32m1_t u_b_abs = __riscv_vreinterpret_v_i32m1_u32m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vsrl_vv_u32m1(a_.sv64, u_b_abs, 2) + , __riscv_vsll_vv_u32m1 (a_.sv64, u_b, 2), __riscv_vmsge_vx_i32m1_b32(b_.sv64, 0, 2), 2), 0 + , __riscv_vmsgeu_vx_u8mf4_b32(u_b_8mf4_abs, 32, 8), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = + (simde_math_abs(b_.values[i]) >= 32) ? 0 : + (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_uint32x2_from_private(r_); @@ -429,10 +507,24 @@ simde_vshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) { _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b128, 56))); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + // change b_ to int8_t + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8(__riscv_vncvt_x_x_w_i16mf4 \ + (__riscv_vncvt_x_x_w_i32mf2 (b_.sv64, 1), 2), 4); + vint8mf8_t b_8mf8_abs = __riscv_vmax_vv_i8mf8(b_8mf8, __riscv_vneg_v_i8mf8 (b_8mf8, 8), 8); + vuint8mf8_t u_b_8mf8_abs = __riscv_vreinterpret_v_i8mf8_u8mf8 (b_8mf8_abs); + vuint64m1_t u_b = __riscv_vreinterpret_v_i64m1_u64m1 (b_.sv64); + vint64m1_t b_abs = __riscv_vmax_vv_i64m1 (b_.sv64, __riscv_vneg_v_i64m1 (b_.sv64, 1), 1); + vuint64m1_t u_b_abs = __riscv_vreinterpret_v_i64m1_u64m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u64m1(__riscv_vmerge_vvm_u64m1(__riscv_vsrl_vv_u64m1(a_.sv64, u_b_abs, 1) + , __riscv_vsll_vv_u64m1 (a_.sv64, u_b, 1), __riscv_vmsge_vx_i64m1_b64(b_.sv64, 0, 1), 1), 0 + , __riscv_vmsgeu_vx_u8mf8_b64(u_b_8mf8_abs, 64, 8), 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_uint64x1_from_private(r_); @@ -476,13 +568,23 @@ simde_vshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256)); r_.m128i = _mm256_cvtepi16_epi8(r256); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t bit_shift_rst = __riscv_vmerge_vxm_i8m1(__riscv_vsll_vv_i8m1 (a_.sv128, __riscv_vreinterpret_v_i8m1_u8m1(b_.sv128), 16), \ + 0, __riscv_vmsge_vx_i8m1_b8(b_.sv128, 8, 16), 16); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv128, __riscv_vneg_v_i8m1 (b_.sv128, 16), 16); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + vint8m1_t scal_shift_rst = __riscv_vmerge_vvm_i8m1(__riscv_vsra_vv_i8m1 (a_.sv128, u_b_abs, 16) + , __riscv_vsra_vx_i8m1(a_.sv128, 7, 16), __riscv_vmsle_vx_i8m1_b8(b_.sv128, -8, 16), 16); + r_.sv128 = __riscv_vmerge_vvm_i8m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8m1_b8 (b_.sv128, 0, 16), 16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -8) ? 
(a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int8x16_from_private(r_); @@ -533,16 +635,28 @@ simde_vshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv128, 8); + vint8mf2_t b_8mf2_abs = __riscv_vmax_vv_i8mf2 (b_8mf2, __riscv_vneg_v_i8mf2 (b_8mf2, 16), 16); + vuint8mf2_t u_b_8mf2_abs = __riscv_vreinterpret_v_i8mf2_u8mf2(b_8mf2_abs); + vuint16m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u16m1 (u_b_8mf2_abs, 16); + vint16m1_t bit_shift_rst = __riscv_vmerge_vxm_i16m1(__riscv_vsll_vv_i16m1 (a_.sv128, __riscv_vreinterpret_v_i16m1_u16m1(b_.sv128), 8), 0, \ + __riscv_vmsge_vx_i8mf2_b16(b_8mf2, 16, 16), 8); + vint16m1_t scal_shift_rst = __riscv_vmerge_vvm_i16m1(__riscv_vsra_vv_i16m1 (a_.sv128, u_b_abs, 8), + __riscv_vsra_vx_i16m1(a_.sv128, 15, 8), __riscv_vmsle_vx_i8mf2_b16(b_8mf2, -16, 16), 8); + r_.sv128 = __riscv_vmerge_vvm_i16m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf2_b16 (b_8mf2, 0, 16), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int16x8_from_private(r_); @@ -587,14 +701,26 @@ simde_vshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { _mm_srav_epi32(a_.m128i, _mm_abs_epi32(B)), _mm_cmpgt_epi32(_mm_setzero_si128(), B)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = - (b_.values[i] >= 0) ? - (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -32) ? 
(a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 (__riscv_vncvt_x_x_w_i16mf2 (b_.sv128, 4), 8); + vint8mf4_t b_8mf4_abs = __riscv_vmax_vv_i8mf4 (b_8mf4, __riscv_vneg_v_i8mf4 (b_8mf4, 16), 16); + vuint8mf4_t u_b_8mf4_abs = __riscv_vreinterpret_v_i8mf4_u8mf4 (b_8mf4_abs); + vuint32m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwcvtu_x_x_v_u16mf2 (u_b_8mf4_abs, 16), 8); + vint32m1_t bit_shift_rst = __riscv_vmerge_vxm_i32m1(__riscv_vsll_vv_i32m1 (a_.sv128, __riscv_vreinterpret_v_i32m1_u32m1(b_.sv128), 4), 0, + __riscv_vmsge_vx_i8mf4_b32(b_8mf4, 32, 16), 4); + vint32m1_t scal_shift_rst = __riscv_vmerge_vvm_i32m1(__riscv_vsra_vv_i32m1 (a_.sv128, u_b_abs, 4), \ + __riscv_vsra_vx_i32m1(a_.sv128, 31, 4), __riscv_vmsle_vx_i8mf4_b32(b_8mf4, -32, 4), 4); + r_.sv128 = __riscv_vmerge_vvm_i32m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf4_b32 (b_8mf4, 0, 16), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = + (b_.values[i] >= 0) ? + (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_int32x4_from_private(r_); @@ -648,10 +774,21 @@ simde_vshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) { _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a_.m128i, maska), b_abs), maska), _mm_cmpgt_epi64(zero, _mm_slli_epi64(b_.m128i, 56))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8 (__riscv_vncvt_x_x_w_i16mf4 (__riscv_vncvt_x_x_w_i32mf2 (b_.sv128, 2), 4), 8); + vint8mf8_t b_8mf8_abs = __riscv_vmax_vv_i8mf8 (b_8mf8, __riscv_vneg_v_i8mf8 (b_8mf8, 16), 16); + vuint8mf8_t u_b_8mf8_abs = __riscv_vreinterpret_v_i8mf8_u8mf8 (b_8mf8_abs); + vuint64m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u64m1(__riscv_vwcvtu_x_x_v_u32mf2 (__riscv_vwcvtu_x_x_v_u16mf4(u_b_8mf8_abs, 16), 8), 4); + vint64m1_t bit_shift_rst = __riscv_vmerge_vxm_i64m1(__riscv_vsll_vv_i64m1 (a_.sv128, __riscv_vreinterpret_v_i64m1_u64m1(b_.sv128), 2), 0, __riscv_vmsge_vx_i8mf8_b64(b_8mf8, 64, 2), 2); + vint64m1_t scal_shift_rst = __riscv_vmerge_vvm_i64m1(__riscv_vsra_vv_i64m1 (a_.sv128, u_b_abs, 2) + , __riscv_vsra_vx_i64m1(a_.sv128, 63, 2), __riscv_vmsle_vx_i8mf8_b64(b_8mf8, -64, 2), 2); + r_.sv128 = __riscv_vmerge_vvm_i64m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf8_b64 (b_8mf8, 0, 16), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_int64x2_from_private(r_); @@ -688,13 +825,22 @@ simde_vshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) { _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256)); r_.m128i = _mm256_cvtepi16_epi8(r256); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, - (simde_math_abs(b_.values[i]) >= 8) ? 0 : - (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t u_b = __riscv_vreinterpret_v_i8m1_u8m1 (b_.sv128); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv128, __riscv_vneg_v_i8m1 (b_.sv128, 16), 16); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + r_.sv128 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vsrl_vv_u8m1(a_.sv128, u_b_abs, 16) + , __riscv_vsll_vv_u8m1 (a_.sv128, u_b, 16), __riscv_vmsge_vx_i8m1_b8(b_.sv128, 0, 16), 16), 0 + , __riscv_vmsgeu_vx_u8m1_b8(u_b_abs, 8, 16), 16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, + (simde_math_abs(b_.values[i]) >= 8) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint8x16_from_private(r_); @@ -743,16 +889,27 @@ simde_vshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, - (simde_math_abs(b_.values[i]) >= 16) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv128, 8); + vint8mf2_t b_8mf2_abs = __riscv_vmax_vv_i8mf2 (b_8mf2, __riscv_vneg_v_i8mf2 (b_8mf2, 16), 16); + vuint8mf2_t u_b_8mf2_abs = __riscv_vreinterpret_v_i8mf2_u8mf2 (b_8mf2_abs); + vuint16m1_t u_b = __riscv_vreinterpret_v_i16m1_u16m1 (b_.sv128); + vuint16m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u16m1 (u_b_8mf2_abs, 16); + r_.sv128 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vsrl_vv_u16m1(a_.sv128, u_b_abs, 8), + __riscv_vsll_vv_u16m1 (a_.sv128, u_b, 8), __riscv_vmsge_vx_i8mf2_b16(b_8mf2, 0, 8), 8), + 0, __riscv_vmsgeu_vx_u8mf2_b16(u_b_8mf2_abs, 16, 16), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, + (simde_math_abs(b_.values[i]) >= 16) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint16x8_from_private(r_); @@ -789,13 +946,26 @@ simde_vshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) { _mm_srlv_epi32(a_.m128i, _mm_abs_epi32(B)), _mm_cmpgt_epi32(_mm_setzero_si128(), B)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 ( + __riscv_vncvt_x_x_w_i16mf2 (b_.sv128, 4), 8); + vint8mf4_t b_8mf4_abs = __riscv_vmax_vv_i8mf4 (b_8mf4, __riscv_vneg_v_i8mf4 (b_8mf4, 16), 16); + vuint8mf4_t u_b_8mf4_abs = __riscv_vreinterpret_v_i8mf4_u8mf4 (b_8mf4_abs); + vuint32m1_t u_b = __riscv_vreinterpret_v_i32m1_u32m1 (b_.sv128); + vint32m1_t b_abs = __riscv_vmax_vv_i32m1 (b_.sv128, __riscv_vneg_v_i32m1 (b_.sv128, 4), 4); + vuint32m1_t u_b_abs = __riscv_vreinterpret_v_i32m1_u32m1 (b_abs); + r_.sv128 = __riscv_vmerge_vxm_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vsrl_vv_u32m1(a_.sv128, u_b_abs, 4) + , __riscv_vsll_vv_u32m1 (a_.sv128, u_b, 4), __riscv_vmsge_vx_i8mf4_b32(b_8mf4, 0, 4), 4), 0 + , __riscv_vmsgeu_vx_u8mf4_b32(u_b_8mf4_abs, 32, 16), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = (simde_math_abs(b_.values[i]) >= 32) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_uint32x4_from_private(r_); @@ -844,10 +1014,24 @@ simde_vshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) { _mm_srlv_epi64(a_.m128i, b_abs), _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b_.m128i, 56))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8 ( + __riscv_vncvt_x_x_w_i16mf4 ( + __riscv_vncvt_x_x_w_i32mf2 (b_.sv128, 2), 4), 8); + vint8mf8_t b_8mf8_abs = __riscv_vmax_vv_i8mf8 (b_8mf8, __riscv_vneg_v_i8mf8 (b_8mf8, 16), 16); + vuint8mf8_t u_b_8mf8_abs = __riscv_vreinterpret_v_i8mf8_u8mf8 (b_8mf8_abs); + vuint64m1_t u_b = __riscv_vreinterpret_v_i64m1_u64m1 (b_.sv128); + vint64m1_t b_abs = __riscv_vmax_vv_i64m1 (b_.sv128, __riscv_vneg_v_i64m1 (b_.sv128, 2), 2); + vuint64m1_t u_b_abs = __riscv_vreinterpret_v_i64m1_u64m1 (b_abs); + r_.sv128 = __riscv_vmerge_vxm_u64m1(__riscv_vmerge_vvm_u64m1(__riscv_vsrl_vv_u64m1(a_.sv128, u_b_abs, 2) + , __riscv_vsll_vv_u64m1 (a_.sv128, u_b, 2), __riscv_vmsge_vx_i8mf8_b64(b_8mf8, 0, 2), 2), 0 + , __riscv_vmsgeu_vx_u8mf8_b64(u_b_8mf8_abs, 64, 16), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_uint64x2_from_private(r_); diff --git a/arm/neon/shl_n.h b/arm/neon/shl_n.h index 61fb143a8..c80cf9f24 100644 --- a/arm/neon/shl_n.h +++ b/arm/neon/shl_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHL_N_H) @@ -69,8 +70,9 @@ simde_vshl_n_s8 (const simde_int8x8_t a, const int n) simde_int8x8_private r_, a_ = simde_int8x8_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i8m1 (a_.sv64, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values << HEDLEY_STATIC_CAST(int8_t, n); #else SIMDE_VECTORIZE @@ -100,7 +102,9 @@ simde_vshl_n_s16 (const simde_int16x4_t a, const int n) r_, a_ = simde_int16x4_to_private(a); - #if 
defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i16m1 (a_.sv64, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(int16_t, n); #else SIMDE_VECTORIZE @@ -129,7 +133,9 @@ simde_vshl_n_s32 (const simde_int32x2_t a, const int n) r_, a_ = simde_int32x2_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i32m1 (a_.sv64, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -158,7 +164,9 @@ simde_vshl_n_s64 (const simde_int64x1_t a, const int n) r_, a_ = simde_int64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i64m1 (a_.sv64, n, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -187,7 +195,9 @@ simde_vshl_n_u8 (const simde_uint8x8_t a, const int n) r_, a_ = simde_uint8x8_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u8m1 (a_.sv64, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values << HEDLEY_STATIC_CAST(uint8_t, n); #else SIMDE_VECTORIZE @@ -217,7 +227,9 @@ simde_vshl_n_u16 (const simde_uint16x4_t a, const int n) r_, a_ = simde_uint16x4_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u16m1 (a_.sv64, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(uint16_t, n); #else SIMDE_VECTORIZE @@ -246,7 +258,9 @@ simde_vshl_n_u32 (const simde_uint32x2_t a, const int n) r_, a_ = simde_uint32x2_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u32m1 (a_.sv64, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -275,7 +289,9 @@ simde_vshl_n_u64 (const simde_uint64x1_t a, const int n) r_, a_ = simde_uint64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u64m1 (a_.sv64, n, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -311,6 +327,8 @@ simde_vshlq_n_s8 (const simde_int8x16_t a, const int n) r_.m128i = _mm_andnot_si128(_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, (1 << n) - 1)), _mm_slli_epi64(a_.m128i, n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i8m1 (a_.sv128, n, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(int8_t, n); #else @@ -344,6 +362,8 @@ simde_vshlq_n_s16 (const simde_int16x8_t a, const int n) r_.m128i = _mm_slli_epi16(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i16m1 (a_.sv128, n, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(int16_t, n); #else @@ -377,6 +397,8 @@ simde_vshlq_n_s32 (const simde_int32x4_t a, const int n) r_.m128i = _mm_slli_epi32(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = 
wasm_i32x4_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i32m1 (a_.sv128, n, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else @@ -410,6 +432,8 @@ simde_vshlq_n_s64 (const simde_int64x2_t a, const int n) r_.m128i = _mm_slli_epi64(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i64m1 (a_.sv128, n, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else @@ -446,6 +470,8 @@ simde_vshlq_n_u8 (const simde_uint8x16_t a, const int n) r_.m128i = _mm_andnot_si128(_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, (1 << n) - 1)), _mm_slli_epi64(a_.m128i, (n))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u8m1 (a_.sv128, n, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(uint8_t, n); #else @@ -479,6 +505,8 @@ simde_vshlq_n_u16 (const simde_uint16x8_t a, const int n) r_.m128i = _mm_slli_epi16(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u16m1 (a_.sv128, n, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(uint16_t, n); #else @@ -512,6 +540,8 @@ simde_vshlq_n_u32 (const simde_uint32x4_t a, const int n) r_.m128i = _mm_slli_epi32(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u32m1 (a_.sv128, n, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else @@ -545,6 +575,8 @@ simde_vshlq_n_u64 (const simde_uint64x2_t a, const int n) r_.m128i = _mm_slli_epi64(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u64m1 (a_.sv128, n, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else diff --git a/arm/neon/shll_high_n.h b/arm/neon/shll_high_n.h new file mode 100644 index 000000000..962d409a5 --- /dev/null +++ b/arm/neon/shll_high_n.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHLL_HIGH_N_H) +#define SIMDE_ARM_NEON_SHLL_HIGH_N_H + +#include "types.h" + +/* + * The constant range requirements for the shift amount *n* looks strange. + * The ARM Neon Intrinsics Reference states that for *_s8, 0 << n << 7. This + * does not match the actual instruction decoding in the ARM Reference manual, + * which states that the shift amount "must be equal to the source element width + * in bits" (ARM DDI 0487F.b C7-1959). So for *_s8 instructions, *n* must be 8, + * for *_s16, it must be 16, and *_s32 must be 32 (similarly for unsigned). + */ + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vshll_high_n_s8 (const simde_int8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, HEDLEY_STATIC_CAST(int16_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n); + } + + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_s8(a, n) vshll_high_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_s8 + #define vshll_high_n_s8(a, n) simde_vshll_high_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vshll_high_n_s16 (const simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_s16(a, n) vshll_high_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_s16 + #define vshll_high_n_s16(a, n) simde_vshll_high_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vshll_high_n_s32 (const simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int64_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_s32(a, n) vshll_high_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_s32 + #define vshll_high_n_s32(a, n) simde_vshll_high_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vshll_high_n_u8 (const simde_uint8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + simde_uint16x8_private r_; + 
simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint16_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n); + } + + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_u8(a, n) vshll_high_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_u8 + #define vshll_high_n_u8(a, n) simde_vshll_high_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vshll_high_n_u16 (const simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_u16(a, n) vshll_high_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_u16 + #define vshll_high_n_u16(a, n) simde_vshll_high_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vshll_high_n_u32 (const simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint64_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_u32(a, n) vshll_high_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_u32 + #define vshll_high_n_u32(a, n) simde_vshll_high_n_u32((a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHLL_HIGH_N_H) */ diff --git a/arm/neon/shll_n.h b/arm/neon/shll_n.h index 36fb96eaa..e8eaca0d4 100644 --- a/arm/neon/shll_n.h +++ b/arm/neon/shll_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHLL_N_H) @@ -46,15 +47,22 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_int16x8_t simde_vshll_n_s8 (const simde_int8x8_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 8) { simde_int16x8_private r_; simde_int8x8_private a_ = simde_int8x8_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, HEDLEY_STATIC_CAST(int16_t, a_.values[i]) << n); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m2_t va_wide = __riscv_vwcvt_x_x_v_i16m2 (a_.sv64, 8); + vint16m2_t rst = __riscv_vsll_vx_i16m2 (va_wide, n, 8); + r_.sv128 = __riscv_vlmul_trunc_v_i16m2_i16m1 (rst); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_i16x8_load8x8(&a_.values); + r_.v128 = wasm_i16x8_shl(tmp, HEDLEY_STATIC_CAST(uint32_t, n)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
HEDLEY_STATIC_CAST(int16_t, HEDLEY_STATIC_CAST(int16_t, a_.values[i]) << n); + } + #endif return simde_int16x8_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -68,15 +76,22 @@ simde_vshll_n_s8 (const simde_int8x8_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vshll_n_s16 (const simde_int16x4_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 16) { simde_int32x4_private r_; simde_int16x4_private a_ = simde_int16x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t va_wide = __riscv_vwcvt_x_x_v_i32m2 (a_.sv64, 4); + vint32m2_t rst = __riscv_vsll_vx_i32m2 (va_wide, n, 4); + r_.sv128 = __riscv_vlmul_trunc_v_i32m2_i32m1 (rst); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_i32x4_load16x4(&a_.values); + r_.v128 = wasm_i32x4_shl(tmp, HEDLEY_STATIC_CAST(uint32_t, n)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) << n; + } + #endif return simde_int32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -90,15 +105,22 @@ simde_vshll_n_s16 (const simde_int16x4_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vshll_n_s32 (const simde_int32x2_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 32) { simde_int64x2_private r_; simde_int32x2_private a_ = simde_int32x2_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t va_wide = __riscv_vwcvt_x_x_v_i64m2 (a_.sv64, 2); + vint64m2_t rst = __riscv_vsll_vx_i64m2 (va_wide, n, 2); + r_.sv128 = __riscv_vlmul_trunc_v_i64m2_i64m1 (rst); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_i64x2_load32x2(&a_.values); + r_.v128 = wasm_i64x2_shl(tmp, HEDLEY_STATIC_CAST(uint32_t, n)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) << n; + } + #endif return simde_int64x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -112,15 +134,22 @@ simde_vshll_n_s32 (const simde_int32x2_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vshll_n_u8 (const simde_uint8x8_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 8) { simde_uint16x8_private r_; simde_uint8x8_private a_ = simde_uint8x8_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint16_t, a_.values[i]) << n); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m2_t va_wide = __riscv_vwcvtu_x_x_v_u16m2 (a_.sv64, 8); + vuint16m2_t rst = __riscv_vsll_vx_u16m2 (va_wide, n, 8); + r_.sv128 = __riscv_vlmul_trunc_v_u16m2_u16m1 (rst); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_u16x8_load8x8(&a_.values); + r_.v128 = wasm_i16x8_shl(tmp, HEDLEY_STATIC_CAST(uint32_t, n)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint16_t, 
a_.values[i]) << n); + } + #endif return simde_uint16x8_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -134,15 +163,22 @@ simde_vshll_n_u8 (const simde_uint8x8_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vshll_n_u16 (const simde_uint16x4_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 16) { simde_uint32x4_private r_; simde_uint16x4_private a_ = simde_uint16x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m2_t va_wide = __riscv_vwcvtu_x_x_v_u32m2 (a_.sv64, 4); + vuint32m2_t rst = __riscv_vsll_vx_u32m2 (va_wide, n, 4); + r_.sv128 = __riscv_vlmul_trunc_v_u32m2_u32m1 (rst); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_u32x4_load16x4(&a_.values); + r_.v128 = wasm_i32x4_shl(tmp, HEDLEY_STATIC_CAST(uint32_t, n)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) << n; + } + #endif return simde_uint32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -156,15 +192,22 @@ simde_vshll_n_u16 (const simde_uint16x4_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vshll_n_u32 (const simde_uint32x2_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 32) { simde_uint64x2_private r_; simde_uint32x2_private a_ = simde_uint32x2_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint64_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m2_t va_wide = __riscv_vwcvtu_x_x_v_u64m2 (a_.sv64, 2); + vuint64m2_t rst = __riscv_vsll_vx_u64m2 (va_wide, n, 2); + r_.sv128 = __riscv_vlmul_trunc_v_u64m2_u64m1 (rst); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_u64x2_load32x2(&a_.values); + r_.v128 = wasm_i64x2_shl(tmp, HEDLEY_STATIC_CAST(uint32_t, n)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint64_t, a_.values[i]) << n; + } + #endif return simde_uint64x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) diff --git a/arm/neon/shr_n.h b/arm/neon/shr_n.h index 5c912571e..aeb4360d5 100644 --- a/arm/neon/shr_n.h +++ b/arm/neon/shr_n.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHR_N_H) @@ -34,6 +36,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_x_vshrh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return a >> ((n == 16) ? 15 : n); +} + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_x_vshrh_n_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return (n == 16) ? 0 : a >> n; +} + SIMDE_FUNCTION_ATTRIBUTES int32_t simde_x_vshrs_n_s32(int32_t a, const int n) @@ -85,7 +101,9 @@ simde_vshr_n_s8 (const simde_int8x8_t a, const int n) a_ = simde_int8x8_to_private(a); int32_t n_ = (n == 8) ? 
7 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i8m1 (a_.sv64, n_, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -118,7 +136,9 @@ simde_vshr_n_s16 (const simde_int16x4_t a, const int n) a_ = simde_int16x4_to_private(a); int32_t n_ = (n == 16) ? 15 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i16m1 (a_.sv64, n_, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -148,7 +168,9 @@ simde_vshr_n_s32 (const simde_int32x2_t a, const int n) a_ = simde_int32x2_to_private(a); int32_t n_ = (n == 32) ? 31 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i32m1 (a_.sv64, n_, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -178,7 +200,9 @@ simde_vshr_n_s64 (const simde_int64x1_t a, const int n) a_ = simde_int64x1_to_private(a); int32_t n_ = (n == 64) ? 63 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i64m1 (a_.sv64, n_, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -208,7 +232,9 @@ simde_vshr_n_u8 (const simde_uint8x8_t a, const int n) if (n == 8) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u8m1 (a_.sv64, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -242,7 +268,9 @@ simde_vshr_n_u16 (const simde_uint16x4_t a, const int n) if (n == 16) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u16m1 (a_.sv64, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -275,7 +303,9 @@ simde_vshr_n_u32 (const simde_uint32x2_t a, const int n) if (n == 32) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u32m1 (a_.sv64, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -308,7 +338,9 @@ simde_vshr_n_u64 (const simde_uint64x1_t a, const int n) if (n == 64) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u64m1 (a_.sv64, n, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -354,6 +386,9 @@ simde_vshrq_n_s8 (const simde_int8x16_t a, const int n) _mm_and_si128(_mm_set1_epi16(0x00FF), _mm_srai_epi16(_mm_slli_epi16(a_.m128i, 8), 8 + (n)))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shr(a_.v128, ((n) == 8) ? 7 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 8) ? 7 : n; + r_.sv128 = __riscv_vsra_vx_i8m1 (a_.sv128, n_, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 8) ? 
7 : n); #else @@ -387,6 +422,9 @@ simde_vshrq_n_s16 (const simde_int16x8_t a, const int n) r_.m128i = _mm_srai_epi16(a_.m128i, n); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_shr(a_.v128, ((n) == 16) ? 15 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 16) ? 15 : n; + r_.sv128 = __riscv_vsra_vx_i16m1 (a_.sv128, n_, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 16) ? 15 : n); #else @@ -420,6 +458,9 @@ simde_vshrq_n_s32 (const simde_int32x4_t a, const int n) r_.m128i = _mm_srai_epi32(a_.m128i, n); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_shr(a_.v128, ((n) == 32) ? 31 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 32) ? 31 : n; + r_.sv128 = __riscv_vsra_vx_i32m1 (a_.sv128, n_, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 32) ? 31 : n); #else @@ -452,6 +493,9 @@ simde_vshrq_n_s64 (const simde_int64x2_t a, const int n) #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_shr(a_.v128, ((n) == 64) ? 63 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 64) ? 63 : n; + r_.sv128 = __riscv_vsra_vx_i64m1 (a_.sv128, n_, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 64) ? 63 : n); #else @@ -493,7 +537,9 @@ simde_vshrq_n_u8 (const simde_uint8x16_t a, const int n) if (n == 8) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u8m1 (a_.sv128, n, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -533,7 +579,9 @@ simde_vshrq_n_u16 (const simde_uint16x8_t a, const int n) if (n == 16) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u16m1 (a_.sv128, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -573,7 +621,9 @@ simde_vshrq_n_u32 (const simde_uint32x4_t a, const int n) if (n == 32) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u32m1 (a_.sv128, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -613,7 +663,9 @@ simde_vshrq_n_u64 (const simde_uint64x2_t a, const int n) if (n == 64) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u64m1 (a_.sv128, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) r_.values = a_.values >> n; #else SIMDE_VECTORIZE diff --git a/arm/neon/shrn_high_n.h b/arm/neon/shrn_high_n.h new file mode 100644 index 000000000..141ab6307 --- /dev/null +++ b/arm/neon/shrn_high_n.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject 
to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_SHRN_HIGH_N_H + +#include "types.h" +#include "reinterpret.h" +#include "combine.h" +#include "shrn_n.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_s16(r, a, n) vshrn_high_n_s16((r), (a), (n)) +#else + #define simde_vshrn_high_n_s16(r, a, n) \ + simde_vcombine_s8((r), simde_vshrn_n_s16((a), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_s16 + #define vshrn_high_n_s16(r, a, n) simde_vshrn_high_n_s16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_s32(r, a, n) vshrn_high_n_s32((r), (a), (n)) +#else + #define simde_vshrn_high_n_s32(r, a, n) \ + simde_vcombine_s16((r), simde_vshrn_n_s32((a), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_s32 + #define vshrn_high_n_s32(r, a, n) simde_vshrn_high_n_s32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_s64(r, a, n) vshrn_high_n_s64((r), (a), (n)) +#else + #define simde_vshrn_high_n_s64(r, a, n) \ + simde_vcombine_s32((r), simde_vshrn_n_s64((a), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_s64 + #define vshrn_high_n_s64(r, a, n) simde_vshrn_high_n_s64((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_u16(r, a, n) vshrn_high_n_u16((r), (a), (n)) +#else + #define simde_vshrn_high_n_u16(r, a, n) \ + simde_vreinterpretq_u8_s8( \ + simde_vcombine_s8(simde_vreinterpret_s8_u8(r), \ + simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_u16 + #define vshrn_high_n_u16(r, a, n) simde_vshrn_high_n_u16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_u32(r, a, n) vshrn_high_n_u32((r), (a), (n)) +#else + #define simde_vshrn_high_n_u32(r, a, n) \ + simde_vreinterpretq_u16_s16( \ + simde_vcombine_s16(simde_vreinterpret_s16_u16(r), \ + simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_u32 + #define vshrn_high_n_u32(r, a, n) simde_vshrn_high_n_u32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_u64(r, a, n) vshrn_high_n_u64((r), (a), (n)) +#else + #define simde_vshrn_high_n_u64(r, a, n) \ + simde_vreinterpretq_u32_s32( \ + simde_vcombine_s32(simde_vreinterpret_s32_u32(r), \ + simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + 
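/* Note on the vshrn_high_n_* fallbacks defined above: the narrowing shift
 * keeps only the low half of each (a >> n) lane, and for the valid
 * immediates (1 <= n <= half the source lane width) those kept bits come
 * entirely from bits already present in the source lane, so routing the
 * unsigned types through the signed helpers via reinterprets is bit-exact
 * (arithmetic vs. logical right shift cannot differ in the retained bits).
 * The combined result is simply r in the low half and the narrowed lanes in
 * the high half, i.e. vcombine(r, vshrn_n(a, n)). */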
#undef vshrn_high_n_u64 + #define vshrn_high_n_u64(r, a, n) simde_vshrn_high_n_u64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHRN_HIGH_N_H) */ diff --git a/arm/neon/shrn_n.h b/arm/neon/shrn_n.h index 6e890b431..ae797d8d0 100644 --- a/arm/neon/shrn_n.h +++ b/arm/neon/shrn_n.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHRN_N_H) @@ -43,10 +45,16 @@ simde_vshrn_n_s16 (const simde_int16x8_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { simde_int8x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (a_.values[i] >> n) & UINT8_MAX); - } + + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t shift = __riscv_vand_vx_i16m1(__riscv_vsll_vx_i16m1 (a_.sv128, n, 8), UINT8_MAX, 8); + r_.sv64 = __riscv_vlmul_ext_v_i8mf2_i8m1(__riscv_vncvt_x_x_w_i8mf2(shift, 8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (a_.values[i] >> n) & UINT8_MAX); + } + #endif return simde_int8x8_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -65,12 +73,15 @@ simde_vshrn_n_s32 (const simde_int32x4_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { simde_int16x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] >> n) & UINT16_MAX); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t shift = __riscv_vand_vx_i32m1(__riscv_vsll_vx_i32m1 (a_.sv128, n, 4), UINT16_MAX, 4); + r_.sv64 = __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vncvt_x_x_w_i16mf2(shift, 4)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] >> n) & UINT16_MAX); + } + #endif return simde_int16x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -90,11 +101,15 @@ simde_vshrn_n_s64 (const simde_int64x2_t a, const int n) simde_int32x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (a_.values[i] >> n) & UINT32_MAX); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t shift = __riscv_vand_vx_i64m1(__riscv_vsll_vx_i64m1 (a_.sv128, n, 2), UINT32_MAX, 2); + r_.sv64 = __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vncvt_x_x_w_i32mf2(shift, 2)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (a_.values[i] >> n) & UINT32_MAX); + } + #endif return simde_int32x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -107,40 +122,36 @@ simde_vshrn_n_s64 (const simde_int64x2_t a, const int n) #define vshrn_n_s64(a, n) simde_vshrn_n_s64((a), (n)) #endif -#define simde_vshrn_n_u16(a, n) \ - simde_vreinterpret_u8_s8( \ - simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n))) - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #undef simde_vshrn_n_u16 #define simde_vshrn_n_u16(a, n) 
vshrn_n_u16((a), (n)) +#else + #define simde_vshrn_n_u16(a, n) \ + simde_vreinterpret_u8_s8( \ + simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n))) #endif - #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vshrn_n_u16 #define vshrn_n_u16(a, n) simde_vshrn_n_u16((a), (n)) #endif -#define simde_vshrn_n_u32(a, n) \ - simde_vreinterpret_u16_s16( \ - simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n))) - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #undef simde_vshrn_n_u32 #define simde_vshrn_n_u32(a, n) vshrn_n_u32((a), (n)) +#else + #define simde_vshrn_n_u32(a, n) \ + simde_vreinterpret_u16_s16( \ + simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vshrn_n_u32 #define vshrn_n_u32(a, n) simde_vshrn_n_u32((a), (n)) #endif -#define simde_vshrn_n_u64(a, n) \ - simde_vreinterpret_u32_s32( \ - simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n))) - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #undef simde_vshrn_n_u64 #define simde_vshrn_n_u64(a, n) vshrn_n_u64((a), (n)) +#else + #define simde_vshrn_n_u64(a, n) \ + simde_vreinterpret_u32_s32( \ + simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vshrn_n_u64 diff --git a/arm/neon/sli_n.h b/arm/neon/sli_n.h new file mode 100644 index 000000000..889bafd99 --- /dev/null +++ b/arm/neon/sli_n.h @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SLI_N_H) +#define SIMDE_ARM_NEON_SLI_N_H + +#include "types.h" +#include "shl_n.h" +#include "dup_n.h" +#include "and.h" +#include "orr.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vslid_n_s64(a, b, n) vslid_n_s64(a, b, n) +#else + #define simde_vslid_n_s64(a, b, n) \ + HEDLEY_STATIC_CAST(int64_t, \ + simde_vslid_n_u64(HEDLEY_STATIC_CAST(uint64_t, a), HEDLEY_STATIC_CAST(uint64_t, b), n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vslid_n_s64 + #define vslid_n_s64(a, b, n) simde_vslid_n_s64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vslid_n_u64(a, b, n) vslid_n_u64(a, b, n) +#else +#define simde_vslid_n_u64(a, b, n) \ + (((a & (UINT64_C(0x7fffffffffffffff) >> (63 - n))) | simde_vshld_n_u64((b), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vslid_n_u64 + #define vslid_n_u64(a, b, n) simde_vslid_n_u64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s8(a, b, n) vsli_n_s8((a), (b), (n)) +#else + #define simde_vsli_n_s8(a, b, n) \ + simde_vreinterpret_s8_u8(simde_vsli_n_u8( \ + simde_vreinterpret_u8_s8((a)), simde_vreinterpret_u8_s8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s8 + #define vsli_n_s8(a, b, n) simde_vsli_n_s8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u8(a, b, n) vsli_n_u8((a), (b), (n)) +#else + #define simde_vsli_n_u8(a, b, n) \ + simde_vorr_u8( \ + simde_vand_u8((a), simde_vdup_n_u8((UINT8_C(0xff) >> (8 - n)))), \ + simde_vshl_n_u8((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u8 + #define vsli_n_u8(a, b, n) simde_vsli_n_u8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s16(a, b, n) vsli_n_s16((a), (b), (n)) +#else + #define simde_vsli_n_s16(a, b, n) \ + simde_vreinterpret_s16_u16(simde_vsli_n_u16( \ + simde_vreinterpret_u16_s16((a)), simde_vreinterpret_u16_s16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s16 + #define vsli_n_s16(a, b, n) simde_vsli_n_s16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u16(a, b, n) vsli_n_u16((a), (b), (n)) +#else + #define simde_vsli_n_u16(a, b, n) \ + simde_vorr_u16( \ + simde_vand_u16((a), simde_vdup_n_u16((UINT16_C(0x7fff) >> (15 - n)))), \ + simde_vshl_n_u16((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u16 + #define vsli_n_u16(a, b, n) simde_vsli_n_u16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s32(a, b, n) vsli_n_s32((a), (b), (n)) +#else + #define simde_vsli_n_s32(a, b, n) \ + simde_vreinterpret_s32_u32(simde_vsli_n_u32( \ + simde_vreinterpret_u32_s32((a)), simde_vreinterpret_u32_s32((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s32 + #define vsli_n_s32(a, b, n) simde_vsli_n_s32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u32(a, b, n) vsli_n_u32((a), (b), (n)) +#else + #define simde_vsli_n_u32(a, b, n) \ + simde_vorr_u32( \ + simde_vand_u32((a), \ + simde_vdup_n_u32((UINT32_C(0x7fffffff) >> (31 - n)))), 
\ + simde_vshl_n_u32((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u32 + #define vsli_n_u32(a, b, n) simde_vsli_n_u32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s64(a, b, n) vsli_n_s64((a), (b), (n)) +#else + #define simde_vsli_n_s64(a, b, n) \ + simde_vreinterpret_s64_u64(simde_vsli_n_u64( \ + simde_vreinterpret_u64_s64((a)), simde_vreinterpret_u64_s64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s64 + #define vsli_n_s64(a, b, n) simde_vsli_n_s64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u64(a, b, n) vsli_n_u64((a), (b), (n)) +#else +#define simde_vsli_n_u64(a, b, n) \ + simde_vorr_u64( \ + simde_vand_u64((a), simde_vdup_n_u64( \ + (UINT64_C(0x7fffffffffffffff) >> (63 - n)))), \ + simde_vshl_n_u64((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u64 + #define vsli_n_u64(a, b, n) simde_vsli_n_u64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s8(a, b, n) vsliq_n_s8((a), (b), (n)) +#else + #define simde_vsliq_n_s8(a, b, n) \ + simde_vreinterpretq_s8_u8(simde_vsliq_n_u8( \ + simde_vreinterpretq_u8_s8((a)), simde_vreinterpretq_u8_s8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s8 + #define vsliq_n_s8(a, b, n) simde_vsliq_n_s8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u8(a, b, n) vsliq_n_u8((a), (b), (n)) +#else + #define simde_vsliq_n_u8(a, b, n) \ + simde_vorrq_u8( \ + simde_vandq_u8((a), simde_vdupq_n_u8((UINT8_C(0x7f) >> (7 - n)))), \ + simde_vshlq_n_u8((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u8 + #define vsliq_n_u8(a, b, n) simde_vsliq_n_u8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s16(a, b, n) vsliq_n_s16((a), (b), (n)) +#else + #define simde_vsliq_n_s16(a, b, n) \ + simde_vreinterpretq_s16_u16(simde_vsliq_n_u16( \ + simde_vreinterpretq_u16_s16((a)), simde_vreinterpretq_u16_s16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s16 + #define vsliq_n_s16(a, b, n) simde_vsliq_n_s16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u16(a, b, n) vsliq_n_u16((a), (b), (n)) +#else + #define simde_vsliq_n_u16(a, b, n) \ + simde_vorrq_u16( \ + simde_vandq_u16((a), simde_vdupq_n_u16((UINT16_C(0x7fff) >> (15 - n)))), \ + simde_vshlq_n_u16((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u16 + #define vsliq_n_u16(a, b, n) simde_vsliq_n_u16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s32(a, b, n) vsliq_n_s32((a), (b), (n)) +#else + #define simde_vsliq_n_s32(a, b, n) \ + simde_vreinterpretq_s32_u32(simde_vsliq_n_u32( \ + simde_vreinterpretq_u32_s32((a)), simde_vreinterpretq_u32_s32((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s32 + #define vsliq_n_s32(a, b, n) simde_vsliq_n_s32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u32(a, b, n) vsliq_n_u32((a), (b), (n)) +#else + #define simde_vsliq_n_u32(a, b, n) \ + simde_vorrq_u32( \ + simde_vandq_u32((a), \ + simde_vdupq_n_u32((UINT32_C(0x7fffffff) >> (31 - n)))), \ + simde_vshlq_n_u32((b), (n))) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u32 + #define vsliq_n_u32(a, b, n) simde_vsliq_n_u32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s64(a, b, n) vsliq_n_s64((a), (b), (n)) +#else + #define simde_vsliq_n_s64(a, b, n) \ + simde_vreinterpretq_s64_u64(simde_vsliq_n_u64( \ + simde_vreinterpretq_u64_s64((a)), simde_vreinterpretq_u64_s64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s64 + #define vsliq_n_s64(a, b, n) simde_vsliq_n_s64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u64(a, b, n) vsliq_n_u64((a), (b), (n)) +#else +#define simde_vsliq_n_u64(a, b, n) \ + simde_vorrq_u64( \ + simde_vandq_u64((a), simde_vdupq_n_u64( \ + (UINT64_C(0x7fffffffffffffff) >> (63 - n)))), \ + simde_vshlq_n_u64((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u64 + #define vsliq_n_u64(a, b, n) simde_vsliq_n_u64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_p8(a, b, n) vsli_n_p8((a), (b), (n)) +#else + #define simde_vsli_n_p8(a, b, n) \ + simde_vreinterpret_p8_u8(simde_vsli_n_u8( \ + simde_vreinterpret_u8_p8((a)), simde_vreinterpret_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_p8 + #define vsli_n_p8(a, b, n) simde_vsli_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_p16(a, b, n) vsli_n_p16((a), (b), (n)) +#else + #define simde_vsli_n_p16(a, b, n) \ + simde_vreinterpret_p16_u16(simde_vsli_n_u16( \ + simde_vreinterpret_u16_p16((a)), simde_vreinterpret_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_p16 + #define vsli_n_p16(a, b, n) simde_vsli_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsli_n_p64(a, b, n) vsli_n_p64((a), (b), (n)) +#else + #define simde_vsli_n_p64(a, b, n) \ + simde_vreinterpret_p64_u64(simde_vsli_n_u64( \ + simde_vreinterpret_u64_p64((a)), simde_vreinterpret_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsli_n_p64 + #define vsli_n_p64(a, b, n) simde_vsli_n_p64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_p8(a, b, n) vsliq_n_p8((a), (b), (n)) +#else + #define simde_vsliq_n_p8(a, b, n) \ + simde_vreinterpretq_p8_u8(simde_vsliq_n_u8( \ + simde_vreinterpretq_u8_p8((a)), simde_vreinterpretq_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_p8 + #define vsliq_n_p8(a, b, n) simde_vsliq_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_p16(a, b, n) vsliq_n_p16((a), (b), (n)) +#else + #define simde_vsliq_n_p16(a, b, n) \ + simde_vreinterpretq_p16_u16(simde_vsliq_n_u16( \ + simde_vreinterpretq_u16_p16((a)), simde_vreinterpretq_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_p16 + #define vsliq_n_p16(a, b, n) simde_vsliq_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsliq_n_p64(a, b, n) vsliq_n_p64((a), (b), (n)) +#else + #define simde_vsliq_n_p64(a, b, n) \ + simde_vreinterpretq_p64_u64(simde_vsliq_n_u64( \ + simde_vreinterpretq_u64_p64((a)), simde_vreinterpretq_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_p64 + 
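/* All of the shift-left-and-insert forms above follow the same per-lane
 * pattern: keep the low n bits of a and OR in b << n.  The masks written as
 * 0x7f... >> (width-1 - n), or 0xff... >> (width - n), both evaluate to
 * (1 << n) - 1 for the valid immediates.  A minimal scalar sketch of one
 * u8 lane, for illustration only (usqadd-style reference naming;
 * sli_u8_ref is not part of the patch): */
static inline uint8_t sli_u8_ref(uint8_t a, uint8_t b, unsigned n) {
  /* n is the immediate in [0, 7]; n == 0 keeps nothing from a */
  uint8_t keep = (n == 0) ? 0 : HEDLEY_STATIC_CAST(uint8_t, 0xFFu >> (8u - n));
  return HEDLEY_STATIC_CAST(uint8_t, (a & keep) | HEDLEY_STATIC_CAST(uint8_t, b << n));
}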
#define vsliq_n_p64(a, b, n) simde_vsliq_n_p64((a), (b), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SLI_N_H) */ diff --git a/arm/neon/sm3.h b/arm/neon/sm3.h new file mode 100644 index 000000000..c68d7c8eb --- /dev/null +++ b/arm/neon/sm3.h @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SM3_H) +#define SIMDE_ARM_NEON_SM3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define SIMDE_ROR32(operand, shift) (((operand) >> (shift)) | ((operand) << (32-shift))) +#define SIMDE_ROL32(operand, shift) (((operand) >> (32-shift)) | ((operand) << (shift))) +#define SIMDE_LSR(operand, shift) ((operand) >> (shift)) +#define SIMDE_LSL(operand, shift) ((operand) << (shift)) + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3ss1q_u32(simde_uint32x4_t n, simde_uint32x4_t m, simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + return vsm3ss1q_u32(n, m, a); + #else + simde_uint32x4_private + r_, + n_ = simde_uint32x4_to_private(n), + m_ = simde_uint32x4_to_private(m), + a_ = simde_uint32x4_to_private(a); + r_.values[3] = SIMDE_ROL32((SIMDE_ROL32(n_.values[3], 12) + m_.values[3] + a_.values[3]), 7); + r_.values[2] = 0; + r_.values[1] = 0; + r_.values[0] = 0; + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3ss1q_u32 + #define vsm3ss1q_u32(n, m, a) simde_vsm3ss1q_u32((n), (m), (a)) +#endif + +#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt1aq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t WjPrime, TT1, SS2; + + WjPrime = c_.values[imm2]; + SS2 = b_.values[3] ^ SIMDE_ROL32(a_.values[3], 12); + TT1 = a_.values[1] ^ (a_.values[3] ^ a_.values[2]); + TT1 = (TT1 + a_.values[0] + SS2 + WjPrime); + r_.values[0] = a_.values[1]; + 
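/* The lane writes here mirror one step of the SM3 compression register
 * update: the old lanes rotate down one slot, the lane taken from B gets
 * the 9-bit left rotation, and the freshly computed TT1 lands in the top
 * lane.  The "1a" form uses the XOR-based boolean function of rounds 0-15;
 * vsm3tt1bq below uses the majority form of rounds 16-63. */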
r_.values[1] = SIMDE_ROL32(a_.values[2], 9); + r_.values[2] = a_.values[3]; + r_.values[3] = TT1; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt1aq_u32(a, b, c, imm2) vsm3tt1aq_u32((a), (b), (c), (imm2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3tt1aq_u32 + #define vsm3tt1aq_u32(a, b, c, imm2) simde_vsm3tt1aq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt1bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t WjPrime, TT1, SS2; + + WjPrime = c_.values[imm2]; + SS2 = b_.values[3] ^ SIMDE_ROL32(a_.values[3], 12); + TT1 = (a_.values[3] & a_.values[1]) | (a_.values[3] & a_.values[2]) | (a_.values[1] & a_.values[2]); + TT1 = (TT1 + a_.values[0] + SS2 + WjPrime); + r_.values[0] = a_.values[1]; + r_.values[1] = SIMDE_ROL32(a_.values[2], 9); + r_.values[2] = a_.values[3]; + r_.values[3] = TT1; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt1bq_u32(a, b, c, imm2) vsm3tt1bq_u32((a), (b), (c), (imm2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3tt1bq_u32 + #define vsm3tt1bq_u32(a, b, c, imm2) simde_vsm3tt1bq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt2aq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t Wj, TT2; + + Wj = c_.values[imm2]; + TT2 = a_.values[1] ^ (a_.values[3] ^ a_.values[2]); + TT2 = (TT2 + a_.values[0] + b_.values[3] + Wj); + r_.values[0] = a_.values[1]; + r_.values[1] = SIMDE_ROL32(a_.values[2], 19); + r_.values[2] = a_.values[3]; + r_.values[3] = TT2 ^ SIMDE_ROL32(TT2, 9) ^ SIMDE_ROL32(TT2, 17); + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt2aq_u32(a, b, c, imm2) vsm3tt2aq_u32((a), (b), (c), (imm2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3tt2aq_u32 + #define vsm3tt2aq_u32(a, b, c, imm2) simde_vsm3tt2aq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt2bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t Wj, TT2; + + Wj = c_.values[imm2]; + TT2 = (a_.values[3] & a_.values[2]) | (~(a_.values[3]) & a_.values[1]); + TT2 = (TT2 + a_.values[0] + b_.values[3] + Wj); + r_.values[0] = a_.values[1]; + r_.values[1] = SIMDE_ROL32(a_.values[2], 19); + r_.values[2] = a_.values[3]; + r_.values[3] = TT2 ^ SIMDE_ROL32(TT2, 9) ^ SIMDE_ROL32(TT2, 17); + return 
simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt2bq_u32(a, b, c, imm2) vsm3tt2bq_u32((a), (b), (c), (imm2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3tt2bq_u32 + #define vsm3tt2bq_u32(a, b, c, imm2) simde_vsm3tt2bq_u32((a), (b), (c), (imm2)) +#endif + +#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0) +HEDLEY_DIAGNOSTIC_POP +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3partw1q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + return vsm3partw1q_u32(a, b, c); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + r_.values[2] = (a_.values[2] ^ b_.values[2]) ^ (SIMDE_ROL32(c_.values[3], 15)); + r_.values[1] = (a_.values[1] ^ b_.values[1]) ^ (SIMDE_ROL32(c_.values[2], 15)); + r_.values[0] = (a_.values[0] ^ b_.values[0]) ^ (SIMDE_ROL32(c_.values[1], 15)); + for(int i = 0; i < 4; ++i) { + if (i == 3) { + r_.values[3] = (a_.values[3] ^ b_.values[3]) ^ (SIMDE_ROL32(r_.values[0], 15)); + } + r_.values[i] = r_.values[i] ^ SIMDE_ROL32(r_.values[i], 15) ^ SIMDE_ROL32(r_.values[i], 23); + } + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3partw1q_u32 + #define vsm3partw1q_u32(a, b, c) simde_vsm3partw1q_u32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3partw2q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + return vsm3partw2q_u32(a, b, c); + #else + simde_uint32x4_private + r_, + tmp_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t tmp2; + tmp_.values[3] = b_.values[3] ^ (SIMDE_ROL32(c_.values[3], 7)); + tmp_.values[2] = b_.values[2] ^ (SIMDE_ROL32(c_.values[2], 7)); + tmp_.values[1] = b_.values[1] ^ (SIMDE_ROL32(c_.values[1], 7)); + tmp_.values[0] = b_.values[0] ^ (SIMDE_ROL32(c_.values[0], 7)); + r_.values[3] = a_.values[3] ^ tmp_.values[3]; + r_.values[2] = a_.values[2] ^ tmp_.values[2]; + r_.values[1] = a_.values[1] ^ tmp_.values[1]; + r_.values[0] = a_.values[0] ^ tmp_.values[0]; + tmp2 = SIMDE_ROL32(tmp_.values[0], 15); + tmp2 = tmp2 ^ SIMDE_ROL32(tmp2, 15) ^ SIMDE_ROL32(tmp2, 23); + r_.values[3] = r_.values[3] ^ tmp2; + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) + #undef vsm3partw2q_u32 + #define vsm3partw2q_u32(a, b, c) simde_vsm3partw2q_u32((a), (b), (c)) +#endif + +#undef SIMDE_ROR32 +#undef SIMDE_ROL32 +#undef SIMDE_LSR +#undef SIMDE_LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SM3_H) */ diff --git a/arm/neon/sm4.h b/arm/neon/sm4.h new file mode 100644 index 000000000..776ada4a5 --- /dev/null +++ b/arm/neon/sm4.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the 
Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SM4_H) +#define SIMDE_ARM_NEON_SM4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define SIMDE_ROR32(operand, shift) (((operand) >> (shift)) | ((operand) << (32-shift))) +#define SIMDE_ROL32(operand, shift) (((operand) >> (32-shift)) | ((operand) << (shift))) +#define SIMDE_LSR(operand, shift) ((operand) >> (shift)) +#define SIMDE_LSL(operand, shift) ((operand) << (shift)) + +static const uint8_t simde_sbox_sm4[256] = { + 0xd6,0x90,0xe9,0xfe,0xcc,0xe1,0x3d,0xb7,0x16,0xb6,0x14,0xc2,0x28,0xfb,0x2c,0x05, + 0x2b,0x67,0x9a,0x76,0x2a,0xbe,0x04,0xc3,0xaa,0x44,0x13,0x26,0x49,0x86,0x06,0x99, + 0x9c,0x42,0x50,0xf4,0x91,0xef,0x98,0x7a,0x33,0x54,0x0b,0x43,0xed,0xcf,0xac,0x62, + 0xe4,0xb3,0x1c,0xa9,0xc9,0x08,0xe8,0x95,0x80,0xdf,0x94,0xfa,0x75,0x8f,0x3f,0xa6, + 0x47,0x07,0xa7,0xfc,0xf3,0x73,0x17,0xba,0x83,0x59,0x3c,0x19,0xe6,0x85,0x4f,0xa8, + 0x68,0x6b,0x81,0xb2,0x71,0x64,0xda,0x8b,0xf8,0xeb,0x0f,0x4b,0x70,0x56,0x9d,0x35, + 0x1e,0x24,0x0e,0x5e,0x63,0x58,0xd1,0xa2,0x25,0x22,0x7c,0x3b,0x01,0x21,0x78,0x87, + 0xd4,0x00,0x46,0x57,0x9f,0xd3,0x27,0x52,0x4c,0x36,0x02,0xe7,0xa0,0xc4,0xc8,0x9e, + 0xea,0xbf,0x8a,0xd2,0x40,0xc7,0x38,0xb5,0xa3,0xf7,0xf2,0xce,0xf9,0x61,0x15,0xa1, + 0xe0,0xae,0x5d,0xa4,0x9b,0x34,0x1a,0x55,0xad,0x93,0x32,0x30,0xf5,0x8c,0xb1,0xe3, + 0x1d,0xf6,0xe2,0x2e,0x82,0x66,0xca,0x60,0xc0,0x29,0x23,0xab,0x0d,0x53,0x4e,0x6f, + 0xd5,0xdb,0x37,0x45,0xde,0xfd,0x8e,0x2f,0x03,0xff,0x6a,0x72,0x6d,0x6c,0x5b,0x51, + 0x8d,0x1b,0xaf,0x92,0xbb,0xdd,0xbc,0x7f,0x11,0xd9,0x5c,0x41,0x1f,0x10,0x5a,0xd8, + 0x0a,0xc1,0x31,0x88,0xa5,0xcd,0x7b,0xbd,0x2d,0x74,0xd0,0x12,0xb8,0xe5,0xb4,0xb0, + 0x89,0x69,0x97,0x4a,0x0c,0x96,0x77,0x7e,0x65,0xb9,0xf1,0x09,0xc5,0x6e,0xc6,0x84, + 0x18,0xf0,0x7d,0xec,0x3a,0xdc,0x4d,0x20,0x79,0xee,0x5f,0x3e,0xd7,0xcb,0x39,0x48 +}; + +static void simde_u32_to_u8x4(uint32_t src, uint8_t* dst) { + for(int i = 0; i < 4; ++i) { + *(dst + i) = HEDLEY_STATIC_CAST(uint8_t, ((src << (i * 8)) >> 24)); + } +} + +static void simde_u32_from_u8x4(uint8_t* src, uint32_t* dst) { + *dst = 0; + for(int i = 0; i < 4; ++i) { + *dst = *dst | (HEDLEY_STATIC_CAST(uint32_t, src[i]) << (24 - i * 8)); + } +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm4eq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM4) + return vsm4eq_u32(a, b); + #else + simde_uint32x4_private + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t intval, 
roundkey; + uint8_t _intval[4]; + for(int index = 0; index < 4; ++index) { + roundkey = b_.values[index]; + + intval = a_.values[3] ^ a_.values[2] ^ a_.values[1] ^ roundkey; + + simde_u32_to_u8x4(intval, _intval); + for(int i = 0; i < 4; ++i) { + _intval[i] = simde_sbox_sm4[_intval[i]]; + } + simde_u32_from_u8x4(_intval, &intval); + intval = intval ^ SIMDE_ROL32(intval, 2) ^ SIMDE_ROL32(intval, 10) ^ SIMDE_ROL32(intval, 18) ^ SIMDE_ROL32(intval, 24); + intval = intval ^ a_.values[0]; + + a_.values[0] = a_.values[1]; + a_.values[1] = a_.values[2]; + a_.values[2] = a_.values[3]; + a_.values[3] = intval; + } + return simde_uint32x4_from_private(a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM4)) + #undef vsm4eq_u32 + #define vsm4eq_u32(a, b) simde_vsm4eq_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm4ekeyq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM4) + return vsm4ekeyq_u32(a, b); + #else + simde_uint32x4_private + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t intval, constval; + uint8_t _intval[4]; + for(int index = 0; index < 4; ++index) { + constval = b_.values[index]; + + intval = a_.values[3] ^ a_.values[2] ^ a_.values[1] ^ constval; + + simde_u32_to_u8x4(intval, _intval); + for(int i = 0; i < 4; ++i) { + _intval[i] = simde_sbox_sm4[_intval[i]]; + } + simde_u32_from_u8x4(_intval, &intval); + intval = intval ^ SIMDE_ROL32(intval, 13) ^ SIMDE_ROL32(intval, 23); + intval = intval ^ a_.values[0]; + + a_.values[0] = a_.values[1]; + a_.values[1] = a_.values[2]; + a_.values[2] = a_.values[3]; + a_.values[3] = intval; + } + return simde_uint32x4_from_private(a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM4)) + #undef vsm4ekeyq_u32 + #define vsm4ekeyq_u32(a, b) simde_vsm4ekeyq_u32((a), (b)) +#endif + +#undef SIMDE_ROR32 +#undef SIMDE_ROL32 +#undef SIMDE_LSR +#undef SIMDE_LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SM4_H) */ diff --git a/arm/neon/sqadd.h b/arm/neon/sqadd.h index 6e1b7e25c..9afd89fff 100644 --- a/arm/neon/sqadd.h +++ b/arm/neon/sqadd.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SQADD_H) @@ -30,6 +31,20 @@ #include "types.h" #include +// Workaround on ARM64 windows due to windows SDK bug +// https://developercommunity.visualstudio.com/t/In-arm64_neonh-vsqaddb_u8-vsqaddh_u16/10271747?sort=newest +#if (defined _MSC_VER) && (defined SIMDE_ARM_NEON_A64V8_NATIVE) && (_MSC_VER < 1938) +#pragma message ("Due to msvc bug, current version of msvc is supported by workaround. 
Recommend to update msvc") +#undef vsqaddb_u8 +#define vsqaddb_u8(src1, src2) neon_usqadds8(__uint8ToN8_v(src1), __int8ToN8_v(src2)).n8_u8[0] +#undef vsqaddh_u16 +#define vsqaddh_u16(src1, src2) neon_usqadds16(__uint16ToN16_v(src1), __int16ToN16_v(src2)).n16_u16[0] +#undef vsqadds_u32 +#define vsqadds_u32(src1, src2) _CopyUInt32FromFloat(neon_usqadds32(_CopyFloatFromUInt32(src1), _CopyFloatFromInt32(src2))) +#undef vsqaddd_u64 +#define vsqaddd_u64(src1, src2) neon_usqadds64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0] +#endif + HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ @@ -128,12 +143,20 @@ simde_vsqadd_u8(simde_uint8x8_t a, simde_int8x8_t b) { r_, a_ = simde_uint8x8_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m2_t sum = __riscv_vreinterpret_v_u16m2_i16m2( + __riscv_vadd_vv_u16m2 ( + __riscv_vwcvtu_x_x_v_u16m2 (a_.sv64, 8), __riscv_vreinterpret_v_i16m2_u16m2( \ + __riscv_vwcvt_x_x_v_i16m2 (b_.sv64, 8)),8)); + r_.sv64 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vxm_u8m1(__riscv_vncvt_x_x_w_u8m1 \ + (__riscv_vreinterpret_v_i16m2_u16m2(sum), 8),255, __riscv_vmsgt_vx_i16m2_b8(sum, 255, 8), + 8), 0, __riscv_vmslt_vx_i16m2_b8(sum, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); + } + #endif return simde_uint8x8_from_private(r_); #endif } @@ -152,12 +175,19 @@ simde_vsqadd_u16(simde_uint16x4_t a, simde_int16x4_t b) { r_, a_ = simde_uint16x4_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t sum = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vadd_vv_u32m2 \ + (__riscv_vwcvtu_x_x_v_u32m2 (a_.sv64, 4), __riscv_vreinterpret_v_i32m2_u32m2( \ + __riscv_vwcvt_x_x_v_i32m2 (b_.sv64, 4)), 4)); + r_.sv64 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vxm_u16m1(__riscv_vncvt_x_x_w_u16m1( \ + __riscv_vreinterpret_v_i32m2_u32m2(sum), 4),UINT16_MAX,__riscv_vmsgt_vx_i32m2_b16(sum, UINT16_MAX, 4), + 4), 0, __riscv_vmslt_vx_i32m2_b16(sum, 0, 4), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif } @@ -176,12 +206,19 @@ simde_vsqadd_u32(simde_uint32x2_t a, simde_int32x2_t b) { r_, a_ = simde_uint32x2_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t sum = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vadd_vv_u64m2 (__riscv_vwcvtu_x_x_v_u64m2 (a_.sv64, 2), \ + __riscv_vreinterpret_v_i64m2_u64m2(__riscv_vwcvt_x_x_v_i64m2 (b_.sv64, 2)), 2)); + r_.sv64 = __riscv_vmerge_vxm_u32m1( + __riscv_vmerge_vxm_u32m1(__riscv_vncvt_x_x_w_u32m1(__riscv_vreinterpret_v_i64m2_u64m2(sum), 2), + UINT32_MAX,__riscv_vmsgt_vx_i64m2_b32(sum, UINT32_MAX, 2),2), 0, __riscv_vmslt_vx_i64m2_b32(sum, 0, 2), 2); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); - } - + 
#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); + } + #endif return simde_uint32x2_from_private(r_); #endif } @@ -200,12 +237,18 @@ simde_vsqadd_u64(simde_uint64x1_t a, simde_int64x1_t b) { r_, a_ = simde_uint64x1_to_private(a); simde_int64x1_private b_ = simde_int64x1_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t sum = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vadd_vx_i64m1(b_.sv64, (int64_t)a_.values[0], 1)); + r_.sv64 = __riscv_vmerge_vvm_u64m1(__riscv_vmerge_vxm_u64m1(sum,UINT64_MAX,__riscv_vmsgtu_vx_u64m1_b64( \ + __riscv_vreinterpret_v_i64m1_u64m1(b_.sv64), UINT64_MAX - a_.values[0], 1), 1), __riscv_vmerge_vxm_u64m1( \ + sum, 0, __riscv_vmsgtu_vx_u64m1_b64(__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vneg_v_i64m1(b_.sv64, 1)), \ + a_.values[0], 1), 1), __riscv_vmsle_vx_i64m1_b64(b_.sv64, 0, 1), 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); + } + #endif return simde_uint64x1_from_private(r_); #endif } @@ -224,12 +267,18 @@ simde_vsqaddq_u8(simde_uint8x16_t a, simde_int8x16_t b) { r_, a_ = simde_uint8x16_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m2_t sum = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vadd_vv_u16m2 (__riscv_vwcvtu_x_x_v_u16m2 \ + (a_.sv128, 16), __riscv_vreinterpret_v_i16m2_u16m2(__riscv_vwcvt_x_x_v_i16m2 (b_.sv128, 16)), 16)); + r_.sv128 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vxm_u8m1(__riscv_vncvt_x_x_w_u8m1( \ + __riscv_vreinterpret_v_i16m2_u16m2(sum), 16), 255, __riscv_vmsgt_vx_i16m2_b8(sum, 255, 16), 16), 0, \ + __riscv_vmslt_vx_i16m2_b8(sum, 0, 16), 16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); + } + #endif return simde_uint8x16_from_private(r_); #endif } @@ -248,12 +297,18 @@ simde_vsqaddq_u16(simde_uint16x8_t a, simde_int16x8_t b) { r_, a_ = simde_uint16x8_to_private(a); simde_int16x8_private b_ = simde_int16x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t sum = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vadd_vv_u32m2 (__riscv_vwcvtu_x_x_v_u32m2 \ + (a_.sv128, 8), __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vwcvt_x_x_v_i32m2 (b_.sv128, 8)), 8)); + r_.sv128 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vxm_u16m1(__riscv_vncvt_x_x_w_u16m1( \ + __riscv_vreinterpret_v_i32m2_u32m2(sum), 8), UINT16_MAX, __riscv_vmsgt_vx_i32m2_b16(sum, UINT16_MAX, 8), \ + 8), 0, __riscv_vmslt_vx_i32m2_b16(sum, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif } @@ -272,12 +327,19 @@ simde_vsqaddq_u32(simde_uint32x4_t a, simde_int32x4_t b) { 
r_, a_ = simde_uint32x4_to_private(a); simde_int32x4_private b_ = simde_int32x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t sum = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vadd_vv_u64m2 ( + __riscv_vwcvtu_x_x_v_u64m2 (a_.sv128, 4), __riscv_vreinterpret_v_i64m2_u64m2( \ + __riscv_vwcvt_x_x_v_i64m2 (b_.sv128, 4)), 4)); + r_.sv128 = __riscv_vmerge_vxm_u32m1(__riscv_vmerge_vxm_u32m1( + __riscv_vncvt_x_x_w_u32m1(__riscv_vreinterpret_v_i64m2_u64m2(sum), 4), UINT32_MAX, + __riscv_vmsgt_vx_i64m2_b32(sum, UINT32_MAX, 4), 4), 0, __riscv_vmslt_vx_i64m2_b32(sum, 0, 4), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); + } + #endif return simde_uint32x4_from_private(r_); #endif } @@ -296,12 +358,21 @@ simde_vsqaddq_u64(simde_uint64x2_t a, simde_int64x2_t b) { r_, a_ = simde_uint64x2_to_private(a); simde_int64x2_private b_ = simde_int64x2_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t sum = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vadd_vv_i64m1(b_.sv128, \ + __riscv_vreinterpret_v_u64m1_i64m1(a_.sv128), 2)); + r_.sv128 = __riscv_vmerge_vvm_u64m1( + __riscv_vmerge_vxm_u64m1(sum, UINT64_MAX, __riscv_vmsgtu_vv_u64m1_b64( + __riscv_vreinterpret_v_i64m1_u64m1(b_.sv128), __riscv_vsub_vv_u64m1(__riscv_vmv_v_x_u64m1(UINT64_MAX, 2), \ + a_.sv128, 2), 2), 2), __riscv_vmerge_vxm_u64m1(sum, 0, __riscv_vmsgtu_vv_u64m1_b64 \ + (__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vneg_v_i64m1(b_.sv128, 2)), a_.sv128, 2), 2), \ + __riscv_vmsle_vx_i64m1_b64(b_.sv128, 0, 2), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); + } + #endif return simde_uint64x2_from_private(r_); #endif } diff --git a/arm/neon/sqrt.h b/arm/neon/sqrt.h new file mode 100644 index 000000000..8ae870262 --- /dev/null +++ b/arm/neon/sqrt.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
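/* Reference for what the vsqadd_*/vsqaddq_* (USQADD) paths above compute per
 * lane: add a signed value to an unsigned accumulator and saturate to the
 * unsigned range.  Illustrative sketch only (usqadd_u8_ref is not part of the
 * patch); the RVV code reaches the same result by widening to twice the
 * element width, adding, clamping, then narrowing back. */
static inline uint8_t usqadd_u8_ref(uint8_t a, int8_t b) {
  /* the widened sum fits comfortably in int16_t: [-128, 382] */
  int16_t sum = HEDLEY_STATIC_CAST(int16_t, a) + HEDLEY_STATIC_CAST(int16_t, b);
  if (sum < 0) return 0;
  if (sum > UINT8_MAX) return UINT8_MAX;
  return HEDLEY_STATIC_CAST(uint8_t, sum);
}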
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_SQRT_H) +#define SIMDE_ARM_NEON_SQRT_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_vsqrth_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsqrth_f16(a); + #elif defined(simde_math_sqrtf) + simde_float32 af = simde_float16_to_float32(a); + return simde_float16_from_float32(simde_math_sqrtf(af)); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vsqrth_f16 + #define vsqrth_f16(a) simde_vsqrth_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vsqrt_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsqrt_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfsqrt_v_f16m1(a_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqrth_f16(a_.values[i]); + } + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vsqrt_f16 + #define vsqrt_f16(a) simde_vsqrt_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vsqrt_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrt_f32(a); + #elif defined(simde_math_sqrtf) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfsqrt_v_f32m1(a_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrtf(a_.values[i]); + } + #endif + return simde_float32x2_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrt_f32 + #define vsqrt_f32(a) simde_vsqrt_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vsqrt_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrt_f64(a); + #elif defined(simde_math_sqrt) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfsqrt_v_f64m1(a_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrt(a_.values[i]); + } + #endif + + return simde_float64x1_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrt_f64 + #define vsqrt_f64(a) simde_vsqrt_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vsqrtq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsqrtq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfsqrt_v_f16m1(a_.sv128, 8); + #else + SIMDE_VECTORIZE + for 
(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqrth_f16(a_.values[i]); + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vsqrtq_f16 + #define vsqrtq_f16(a) simde_vsqrtq_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vsqrtq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrtq_f32(a); + #elif defined(simde_math_sqrtf) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfsqrt_v_f32m1(a_.sv128, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrtf(a_.values[i]); + } + #endif + return simde_float32x4_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrtq_f32 + #define vsqrtq_f32(a) simde_vsqrtq_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vsqrtq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrtq_f64(a); + #elif defined(simde_math_sqrt) + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfsqrt_v_f64m1(a_.sv128, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrt(a_.values[i]); + } + #endif + return simde_float64x2_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrtq_f64 + #define vsqrtq_f64(a) simde_vsqrtq_f64((a)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP +#endif /* !defined(SIMDE_ARM_NEON_SQRT_H) */ diff --git a/arm/neon/sri_n.h b/arm/neon/sri_n.h index f2b337703..d0213c2c9 100644 --- a/arm/neon/sri_n.h +++ b/arm/neon/sri_n.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SRI_N_H) @@ -266,6 +267,78 @@ SIMDE_BEGIN_DECLS_ #define vsriq_n_u64(a, b, n) simde_vsriq_n_u64((a), (b), (n)) #endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsri_n_p8(a, b, n) vsri_n_p8((a), (b), (n)) +#else + #define simde_vsri_n_p8(a, b, n) \ + simde_vreinterpret_p8_u8(simde_vsri_n_u8( \ + simde_vreinterpret_u8_p8((a)), simde_vreinterpret_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsri_n_p8 + #define vsri_n_p8(a, b, n) simde_vsri_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsri_n_p16(a, b, n) vsri_n_p16((a), (b), (n)) +#else + #define simde_vsri_n_p16(a, b, n) \ + simde_vreinterpret_p16_u16(simde_vsri_n_u16( \ + simde_vreinterpret_u16_p16((a)), simde_vreinterpret_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsri_n_p16 + #define vsri_n_p16(a, b, n) simde_vsri_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsri_n_p64(a, b, n) vsri_n_p64((a), (b), (n)) +#else + #define simde_vsri_n_p64(a, b, n) \ + simde_vreinterpret_p64_u64(simde_vsri_n_u64( \ + simde_vreinterpret_u64_p64((a)), simde_vreinterpret_u64_p64((b)), (n))) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsri_n_p64 + #define vsri_n_p64(a, b, n) simde_vsri_n_p64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsriq_n_p8(a, b, n) vsriq_n_p8((a), (b), (n)) +#else + #define simde_vsriq_n_p8(a, b, n) \ + simde_vreinterpretq_p8_u8(simde_vsriq_n_u8( \ + simde_vreinterpretq_u8_p8((a)), simde_vreinterpretq_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsriq_n_p8 + #define vsriq_n_p8(a, b, n) simde_vsriq_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsriq_n_p16(a, b, n) vsriq_n_p16((a), (b), (n)) +#else + #define simde_vsriq_n_p16(a, b, n) \ + simde_vreinterpretq_p16_u16(simde_vsriq_n_u16( \ + simde_vreinterpretq_u16_p16((a)), simde_vreinterpretq_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsriq_n_p16 + #define vsriq_n_p16(a, b, n) simde_vsriq_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsriq_n_p64(a, b, n) vsriq_n_p64((a), (b), (n)) +#else + #define simde_vsriq_n_p64(a, b, n) \ + simde_vreinterpretq_p64_u64(simde_vsriq_n_u64( \ + simde_vreinterpretq_u64_p64((a)), simde_vreinterpretq_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsriq_n_p64 + #define vsriq_n_p64(a, b, n) simde_vsriq_n_p64((a), (b), (n)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/st1.h b/arm/neon/st1.h index 6d5901aac..bf7ae5546 100644 --- a/arm/neon/st1.h +++ b/arm/neon/st1.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1_H) @@ -40,10 +42,15 @@ simde_vst1_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4_t val vst1_f16(ptr, val); #else simde_float16x4_private val_ = simde_float16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + __riscv_vse16_v_f16m1((_Float16 *)ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1_f16 #define vst1_f16(a, b) simde_vst1_f16((a), (b)) #endif @@ -55,7 +62,11 @@ simde_vst1_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2_t val vst1_f32(ptr, val); #else simde_float32x2_private val_ = simde_float32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_f32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -70,7 +81,11 @@ simde_vst1_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_float64x1_t val vst1_f64(ptr, val); #else simde_float64x1_private val_ = simde_float64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_f64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -85,7 +100,11 @@ simde_vst1_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int8x8_t val) { vst1_s8(ptr, val); #else simde_int8x8_private val_ = 
simde_int8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_i8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -100,7 +119,11 @@ simde_vst1_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int16x4_t val) { vst1_s16(ptr, val); #else simde_int16x4_private val_ = simde_int16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_i16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -115,7 +138,11 @@ simde_vst1_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x2_t val) { vst1_s32(ptr, val); #else simde_int32x2_private val_ = simde_int32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_i32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -130,7 +157,11 @@ simde_vst1_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_int64x1_t val) { vst1_s64(ptr, val); #else simde_int64x1_private val_ = simde_int64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_i64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -145,7 +176,11 @@ simde_vst1_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint8x8_t val) { vst1_u8(ptr, val); #else simde_uint8x8_private val_ = simde_uint8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -160,7 +195,11 @@ simde_vst1_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint16x4_t val) { vst1_u16(ptr, val); #else simde_uint16x4_private val_ = simde_uint16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -175,7 +214,11 @@ simde_vst1_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x2_t val) { vst1_u32(ptr, val); #else simde_uint32x2_private val_ = simde_uint32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_u32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -190,7 +233,11 @@ simde_vst1_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_uint64x1_t val) { vst1_u64(ptr, val); #else simde_uint64x1_private val_ = simde_uint64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -208,12 +255,15 @@ simde_vst1q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x8_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + 
__riscv_vse16_v_f16m1((_Float16 *)ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1q_f16 #define vst1q_f16(a, b) simde_vst1q_f16((a), (b)) #endif @@ -223,13 +273,13 @@ void simde_vst1q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_f32(ptr, val); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(val, 0, ptr); #else simde_float32x4_private val_ = simde_float32x4_to_private(val); #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_f32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -250,6 +300,8 @@ simde_vst1q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_f64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -270,6 +322,8 @@ simde_vst1q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int8x16_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_i8m1(ptr , val_.sv128 , 16); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -290,6 +344,8 @@ simde_vst1q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int16x8_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_i16m1(ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -310,6 +366,8 @@ simde_vst1q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x4_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_i32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -330,6 +388,8 @@ simde_vst1q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x2_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_i64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -350,6 +410,8 @@ simde_vst1q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint8x16_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv128 , 16); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -370,6 +432,8 @@ simde_vst1q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint16x8_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -390,6 +454,8 @@ simde_vst1q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint32x4_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_u32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -410,6 +476,8 @@ simde_vst1q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -420,6 +488,169 @@ simde_vst1q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2_t val) { #define vst1q_u64(a, b) simde_vst1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly8x8_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_p8(ptr, val); + #else + simde_poly8x8_private val_ = simde_poly8x8_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p8 + #define vst1_p8(a, b) simde_vst1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_p16(ptr, val); + #else + simde_poly16x4_private val_ = simde_poly16x4_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p16 + #define vst1_p16(a, b) simde_vst1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_poly64x1_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst1_p64(ptr, val); + #else + simde_poly64x1_private val_ = simde_poly64x1_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_p64 + #define vst1_p64(a, b) simde_vst1_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x16_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_p8(ptr, val); + #else + simde_poly8x16_private val_ = simde_poly8x16_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p8 + #define vst1q_p8(a, b) simde_vst1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x8_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_p16(ptr, val); + #else + simde_poly16x8_private val_ = simde_poly16x8_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p16 + #define vst1q_p16(a, b) simde_vst1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst1q_p64(ptr, val); + #else + simde_poly64x2_private val_ = simde_poly64x2_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_p64 + #define vst1q_p64(a, b) simde_vst1q_p64((a), (b)) +#endif + +#if 
!defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vstrq_p128(simde_poly128_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_poly128_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + vstrq_p128(ptr, val); + #else + simde_memcpy(ptr, &val, sizeof(val)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_CRYPTO)) + #undef vstrq_p128 + #define vstrq_p128(a, b) simde_vstrq_p128((a), (b)) +#endif +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16(ptr, val); + #else + simde_bfloat16x4_private val_ = simde_bfloat16x4_to_private(val); + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1_bf16 + #define vst1_bf16(a, b) simde_vst1_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_bfloat16x8_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16(ptr, val); + #else + simde_bfloat16x8_private val_ = simde_bfloat16x8_to_private(val); + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1q_bf16 + #define vst1q_bf16(a, b) simde_vst1q_bf16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/st1_lane.h b/arm/neon/st1_lane.h index f0e78365c..8d01b13de 100644 --- a/arm/neon/st1_lane.h +++ b/arm/neon/st1_lane.h @@ -23,16 +23,35 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST1_LANE_H) #define SIMDE_ARM_NEON_ST1_LANE_H + #include "types.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_f16(simde_float16_t *ptr, simde_float16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst1_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x4_private val_ = simde_float16x4_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1_lane_f16 + #define vst1_lane_f16(a, b, c) simde_vst1_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_lane_f32(simde_float32_t *ptr, simde_float32x2_t val, const int lane) @@ -196,6 +215,23 @@ simde_vst1_lane_u64(uint64_t *ptr, simde_uint64x1_t val, const int lane) #define vst1_lane_u64(a, b, c) simde_vst1_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_f16(simde_float16_t *ptr, simde_float16x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + 
simde_float16x8_private val_ = simde_float16x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1q_lane_f16 + #define vst1q_lane_f16(a, b, c) simde_vst1q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst1q_lane_f32(simde_float32_t *ptr, simde_float32x4_t val, const int lane) @@ -356,6 +392,137 @@ simde_vst1q_lane_u64(uint64_t *ptr, simde_uint64x2_t val, const int lane) #define vst1q_lane_u64(a, b, c) simde_vst1q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_p8(simde_poly8_t *ptr, simde_poly8x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private val_ = simde_poly8x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_p8 + #define vst1_lane_p8(a, b, c) simde_vst1_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_p16(simde_poly16_t *ptr, simde_poly16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst1_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private val_ = simde_poly16x4_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_p16 + #define vst1_lane_p16(a, b, c) simde_vst1_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_p64(simde_poly64_t *ptr, simde_poly64x1_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + (void) lane; + vst1_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private val_ = simde_poly64x1_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_p64 + #define vst1_lane_p64(a, b, c) simde_vst1_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_p8(simde_poly8_t *ptr, simde_poly8x16_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst1q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private val_ = simde_poly8x16_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_p8 + #define vst1q_lane_p8(a, b, c) simde_vst1q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_p16(simde_poly16_t *ptr, simde_poly16x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private val_ = simde_poly16x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_p16 + #define vst1q_lane_p16(a, b, c) simde_vst1q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_p64(simde_poly64_t *ptr, simde_poly64x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst1q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private val_ = simde_poly64x2_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_p64 + #define vst1q_lane_p64(a, b, c) simde_vst1q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst1_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private val_ = simde_bfloat16x4_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1_lane_bf16 + #define vst1_lane_bf16(a, b, c) simde_vst1_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private val_ = simde_bfloat16x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1q_lane_bf16 + #define vst1q_lane_bf16(a, b, c) simde_vst1q_lane_bf16((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/st1_x2.h b/arm/neon/st1_x2.h new file mode 100644 index 000000000..fed93bd65 --- /dev/null +++ b/arm/neon/st1_x2.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1_X2_H) +#define SIMDE_ARM_NEON_ST1_X2_H + +#include "types.h" +#include "st1.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f16_x2(ptr, val); + #else + simde_float16x4_private a_[2] = {simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a_[1].sv64 , 4); + #else + simde_float16_t buf[8]; + for (size_t i = 0; i < 8; i++) { + buf[i] = a_[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && !defined(SIMDE_BUG_GCC_REV_260989))) + #undef vst1_f16_x2 + #define vst1_f16_x2(ptr, val) simde_vst1_f16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f32_x2(simde_float32 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f32_x2(ptr, val); + #else + simde_vst1_f32(ptr, val.val[0]); + simde_vst1_f32(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_f32_x2 + #define vst1_f32_x2(ptr, val) simde_vst1_f32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f64_x2(simde_float64 ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1_f64_x2(ptr, val); + #else + simde_vst1_f64(ptr, val.val[0]); + simde_vst1_f64(ptr+1, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1_f64_x2 + #define vst1_f64_x2(ptr, val) simde_vst1_f64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s8_x2(int8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s8_x2(ptr, val); + #else + simde_vst1_s8(ptr, val.val[0]); + simde_vst1_s8(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s8_x2 + #define vst1_s8_x2(ptr, val) simde_vst1_s8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s16_x2(int16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s16_x2(ptr, val); + #else + simde_vst1_s16(ptr, val.val[0]); + simde_vst1_s16(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s16_x2 + #define vst1_s16_x2(ptr, val) simde_vst1_s16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
+simde_vst1_s32_x2(int32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s32_x2(ptr, val); + #else + simde_vst1_s32(ptr, val.val[0]); + simde_vst1_s32(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s32_x2 + #define vst1_s32_x2(ptr, val) simde_vst1_s32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s64_x2(int64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s64_x2(ptr, val); + #else + simde_vst1_s64(ptr, val.val[0]); + simde_vst1_s64(ptr+1, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s64_x2 + #define vst1_s64_x2(ptr, val) simde_vst1_s64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u8_x2(uint8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u8_x2(ptr, val); + #else + simde_vst1_u8(ptr, val.val[0]); + simde_vst1_u8(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u8_x2 + #define vst1_u8_x2(ptr, val) simde_vst1_u8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u16_x2(uint16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u16_x2(ptr, val); + #else + simde_vst1_u16(ptr, val.val[0]); + simde_vst1_u16(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u16_x2 + #define vst1_u16_x2(ptr, val) simde_vst1_u16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u32_x2(uint32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint32x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u32_x2(ptr, val); + #else + simde_vst1_u32(ptr, val.val[0]); + simde_vst1_u32(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u32_x2 + #define vst1_u32_x2(ptr, val) simde_vst1_u32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u64_x2(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u64_x2(ptr, val); + #else + simde_vst1_u64(ptr, val.val[0]); + simde_vst1_u64(ptr+1, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u64_x2 + #define vst1_u64_x2(ptr, val) simde_vst1_u64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p8_x2(ptr, val); + #else + 
simde_poly8x8_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly8x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1_p8_x2 + #define vst1_p8_x2(a, b) simde_vst1_p8_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p16_x2(ptr, val); + #else + simde_poly16x4_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly16x4_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1_p16_x2 + #define vst1_p16_x2(a, b) simde_vst1_p16_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p64_x2(ptr, val); + #else + simde_poly64x1_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly64x1_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1_p64_x2 + #define vst1_p64_x2(a, b) simde_vst1_p64_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16_x2(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_bfloat16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16_x2(ptr, val); + #else + simde_bfloat16x4_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_bfloat16x4_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1_bf16_x2 + #define vst1_bf16_x2(a, b) simde_vst1_bf16_x2((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1_X2_H) */ diff --git a/arm/neon/st1_x3.h b/arm/neon/st1_x3.h new file mode 100644 index 000000000..097c4aabd --- /dev/null +++ b/arm/neon/st1_x3.h @@ -0,0 +1,341 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * 
obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1_X3_H) +#define SIMDE_ARM_NEON_ST1_X3_H + +#include "types.h" +#include "st1.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1_f16_x3(ptr, val); + #else + simde_float16x4_private a[3] = { simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a[1].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a[2].sv64 , 4); + #else + simde_float16_t buf[12]; + for (size_t i = 0; i < 12 ; i++) { + buf[i] = a[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1_f16_x3 + #define vst1_f16_x3(a, b) simde_vst1_f16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f32_x3(simde_float32 ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f32_x3(ptr, val); + #else + simde_vst1_f32(ptr, val.val[0]); + simde_vst1_f32(ptr+2, val.val[1]); + simde_vst1_f32(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_f32_x3 + #define vst1_f32_x3(ptr, val) simde_vst1_f32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f64_x3(simde_float64 ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1_f64_x3(ptr, val); + #else + simde_vst1_f64(ptr, val.val[0]); + simde_vst1_f64(ptr+1, val.val[1]); + simde_vst1_f64(ptr+2, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1_f64_x3 + #define vst1_f64_x3(ptr, val) simde_vst1_f64_x3((ptr), (val)) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s8_x3(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s8_x3(ptr, val); + #else + simde_vst1_s8(ptr, val.val[0]); + simde_vst1_s8(ptr+8, val.val[1]); + simde_vst1_s8(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s8_x3 + #define vst1_s8_x3(ptr, val) simde_vst1_s8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s16_x3(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s16_x3(ptr, val); + #else + simde_vst1_s16(ptr, val.val[0]); + simde_vst1_s16(ptr+4, val.val[1]); + simde_vst1_s16(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s16_x3 + #define vst1_s16_x3(ptr, val) simde_vst1_s16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s32_x3(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s32_x3(ptr, val); + #else + simde_vst1_s32(ptr, val.val[0]); + simde_vst1_s32(ptr+2, val.val[1]); + simde_vst1_s32(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s32_x3 + #define vst1_s32_x3(ptr, val) simde_vst1_s32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s64_x3(int64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s64_x3(ptr, val); + #else + simde_vst1_s64(ptr, val.val[0]); + simde_vst1_s64(ptr+1, val.val[1]); + simde_vst1_s64(ptr+2, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_s64_x3 + #define vst1_s64_x3(ptr, val) simde_vst1_s64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u8_x3(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u8_x3(ptr, val); + #else + simde_vst1_u8(ptr, val.val[0]); + simde_vst1_u8(ptr+8, val.val[1]); + simde_vst1_u8(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u8_x3 + #define vst1_u8_x3(ptr, val) simde_vst1_u8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u16_x3(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u16_x3(ptr, val); + #else + simde_vst1_u16(ptr, val.val[0]); + simde_vst1_u16(ptr+4, val.val[1]); + simde_vst1_u16(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u16_x3 + #define vst1_u16_x3(ptr, val) simde_vst1_u16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u32_x3(uint32_t 
ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u32_x3(ptr, val); + #else + simde_vst1_u32(ptr, val.val[0]); + simde_vst1_u32(ptr+2, val.val[1]); + simde_vst1_u32(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u32_x3 + #define vst1_u32_x3(ptr, val) simde_vst1_u32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u64_x3(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u64_x3(ptr, val); + #else + simde_vst1_u64(ptr, val.val[0]); + simde_vst1_u64(ptr+1, val.val[1]); + simde_vst1_u64(ptr+2, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1_u64_x3 + #define vst1_u64_x3(ptr, val) simde_vst1_u64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p8_x3(ptr, val); + #else + simde_poly8x8_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly8x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+16 , val_[2].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1_p8_x3 + #define vst1_p8_x3(a, b) simde_vst1_p8_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p16_x3(ptr, val); + #else + simde_poly16x4_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly16x4_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+8 , val_[2].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1_p16_x3 + #define vst1_p16_x3(a, b) simde_vst1_p16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p64_x3(ptr, val); + #else + simde_poly64x1_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly64x1_to_private(val.val[i]); + } + #if 
defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+2 , val_[2].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1_p64_x3 + #define vst1_p64_x3(a, b) simde_vst1_p64_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16_x3(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_bfloat16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16_x3(ptr, val); + #else + simde_bfloat16x4_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_bfloat16x4_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1_bf16_x3 + #define vst1_bf16_x3(a, b) simde_vst1_bf16_x3((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1_X3_H) */ diff --git a/arm/neon/st1_x4.h b/arm/neon/st1_x4.h new file mode 100644 index 000000000..5d3ed1541 --- /dev/null +++ b/arm/neon/st1_x4.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1_X4_H) +#define SIMDE_ARM_NEON_ST1_X4_H + +#include "types.h" +#include "st1.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1_f16_x4(ptr, val); + #else + simde_float16x4_private a_[4] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]), simde_float16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a_[1].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[2].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+12 , a_[3].sv64 , 4); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < 16 ; i++) { + buf[i] = a_[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1_f16_x4 + #define vst1_f16_x4(a, b) simde_vst1_f16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_f32_x4(ptr, val); + #else + simde_vst1_f32(ptr, val.val[0]); + simde_vst1_f32(ptr+2, val.val[1]); + simde_vst1_f32(ptr+4, val.val[2]); + simde_vst1_f32(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_f32_x4 + #define vst1_f32_x4(ptr, val) simde_vst1_f32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_GCC_114521) + vst1_f64_x4(ptr, val); + #else + simde_vst1_f64(ptr, val.val[0]); + simde_vst1_f64(ptr+1, val.val[1]); + simde_vst1_f64(ptr+2, val.val[2]); + simde_vst1_f64(ptr+3, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_114521)) + #undef vst1_f64_x4 + #define vst1_f64_x4(ptr, val) simde_vst1_f64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_s8_x4(ptr, val); + #else + simde_vst1_s8(ptr, val.val[0]); + simde_vst1_s8(ptr+8, val.val[1]); + simde_vst1_s8(ptr+16, val.val[2]); + simde_vst1_s8(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_s8_x4 + #define vst1_s8_x4(ptr, val) 
simde_vst1_s8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_s16_x4(ptr, val); + #else + simde_vst1_s16(ptr, val.val[0]); + simde_vst1_s16(ptr+4, val.val[1]); + simde_vst1_s16(ptr+8, val.val[2]); + simde_vst1_s16(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_s16_x4 + #define vst1_s16_x4(ptr, val) simde_vst1_s16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_s32_x4(ptr, val); + #else + simde_vst1_s32(ptr, val.val[0]); + simde_vst1_s32(ptr+2, val.val[1]); + simde_vst1_s32(ptr+4, val.val[2]); + simde_vst1_s32(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_s32_x4 + #define vst1_s32_x4(ptr, val) simde_vst1_s32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_s64_x4(ptr, val); + #else + simde_vst1_s64(ptr, val.val[0]); + simde_vst1_s64(ptr+1, val.val[1]); + simde_vst1_s64(ptr+2, val.val[2]); + simde_vst1_s64(ptr+3, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_s64_x4 + #define vst1_s64_x4(ptr, val) simde_vst1_s64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_u8_x4(ptr, val); + #else + simde_vst1_u8(ptr, val.val[0]); + simde_vst1_u8(ptr+8, val.val[1]); + simde_vst1_u8(ptr+16, val.val[2]); + simde_vst1_u8(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_u8_x4 + #define vst1_u8_x4(ptr, val) simde_vst1_u8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_u16_x4(ptr, val); + #else + simde_vst1_u16(ptr, val.val[0]); + simde_vst1_u16(ptr+4, val.val[1]); + simde_vst1_u16(ptr+8, val.val[2]); + simde_vst1_u16(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_u16_x4 + #define vst1_u16_x4(ptr, val) simde_vst1_u16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
+simde_vst1_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_u32_x4(ptr, val); + #else + simde_vst1_u32(ptr, val.val[0]); + simde_vst1_u32(ptr+2, val.val[1]); + simde_vst1_u32(ptr+4, val.val[2]); + simde_vst1_u32(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_u32_x4 + #define vst1_u32_x4(ptr, val) simde_vst1_u32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_u64_x4(ptr, val); + #else + simde_vst1_u64(ptr, val.val[0]); + simde_vst1_u64(ptr+1, val.val[1]); + simde_vst1_u64(ptr+2, val.val[2]); + simde_vst1_u64(ptr+3, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_u64_x4 + #define vst1_u64_x4(ptr, val) simde_vst1_u64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_p8_x4(ptr, val); + #else + simde_poly8x8_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly8x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+16 , val_[2].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+24 , val_[3].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_p8_x4 + #define vst1_p8_x4(a, b) simde_vst1_p8_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_p16_x4(ptr, val); + #else + simde_poly16x4_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly16x4_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+8 , val_[2].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+12 , val_[3].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_p16_x4 + #define 
vst1_p16_x4(a, b) simde_vst1_p16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521) + vst1_p64_x4(ptr, val); + #else + simde_poly64x1_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly64x1_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+2 , val_[2].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+3 , val_[3].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521))) + #undef vst1_p64_x4 + #define vst1_p64_x4(a, b) simde_vst1_p64_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16_x4(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_bfloat16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16_x4(ptr, val); + #else + simde_bfloat16x4_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_bfloat16x4_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1_bf16_x4 + #define vst1_bf16_x4(a, b) simde_vst1_bf16_x4((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1_X4_H) */ diff --git a/arm/neon/st1q_x2.h b/arm/neon/st1q_x2.h new file mode 100644 index 000000000..aca91ee90 --- /dev/null +++ b/arm/neon/st1q_x2.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1Q_X2_H) +#define SIMDE_ARM_NEON_ST1Q_X2_H + +#include "types.h" +#include "st1.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1q_f16_x2(ptr, val); + #else + simde_float16x8_private a_[2] = {simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[1].sv128 , 8); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < 16; i++) { + buf[i] = a_[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1q_f16_x2 + #define vst1q_f16_x2(a, b) simde_vst1q_f16_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f32_x2(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_f32_x2(ptr, val); + #else + simde_vst1q_f32(ptr, val.val[0]); + simde_vst1q_f32(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_f32_x2 + #define vst1q_f32_x2(ptr, val) simde_vst1q_f32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f64_x2(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1q_f64_x2(ptr, val); + #else + simde_vst1q_f64(ptr, val.val[0]); + simde_vst1q_f64(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_f64_x2 + #define vst1q_f64_x2(ptr, val) simde_vst1q_f64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s8_x2(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s8_x2(ptr, val); + #else + simde_vst1q_s8(ptr, val.val[0]); + simde_vst1q_s8(ptr+16, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s8_x2 + #define vst1q_s8_x2(ptr, val) simde_vst1q_s8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s16_x2(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s16_x2(ptr, val); + #else + simde_vst1q_s16(ptr, val.val[0]); + simde_vst1q_s16(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s16_x2 + #define vst1q_s16_x2(ptr, val) simde_vst1q_s16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s32_x2(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x4x2_t val) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s32_x2(ptr, val); + #else + simde_vst1q_s32(ptr, val.val[0]); + simde_vst1q_s32(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s32_x2 + #define vst1q_s32_x2(ptr, val) simde_vst1q_s32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s64_x2(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s64_x2(ptr, val); + #else + simde_vst1q_s64(ptr, val.val[0]); + simde_vst1q_s64(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s64_x2 + #define vst1q_s64_x2(ptr, val) simde_vst1q_s64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u8_x2(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u8_x2(ptr, val); + #else + simde_vst1q_u8(ptr, val.val[0]); + simde_vst1q_u8(ptr+16, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u8_x2 + #define vst1q_u8_x2(ptr, val) simde_vst1q_u8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u16_x2(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u16_x2(ptr, val); + #else + simde_vst1q_u16(ptr, val.val[0]); + simde_vst1q_u16(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u16_x2 + #define vst1q_u16_x2(ptr, val) simde_vst1q_u16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u32_x2(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u32_x2(ptr, val); + #else + simde_vst1q_u32(ptr, val.val[0]); + simde_vst1q_u32(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u32_x2 + #define vst1q_u32_x2(ptr, val) simde_vst1q_u32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u64_x2(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u64_x2(ptr, val); + #else + simde_vst1q_u64(ptr, val.val[0]); + simde_vst1q_u64(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u64_x2 + #define vst1q_u64_x2(ptr, val) simde_vst1q_u64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p8_x2(ptr, val); + #else + simde_poly8x16_private val_[2]; + for 
(size_t i = 0; i < 2; i++) { + val_[i] = simde_poly8x16_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p8_x2 + #define vst1q_p8_x2(a, b) simde_vst1q_p8_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p16_x2(ptr, val); + #else + simde_poly16x8_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly16x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p16_x2 + #define vst1q_p16_x2(a, b) simde_vst1q_p16_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p64_x2(ptr, val); + #else + simde_poly64x2_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly64x2_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p64_x2 + #define vst1q_p64_x2(a, b) simde_vst1q_p64_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16_x2(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_bfloat16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16_x2(ptr, val); + #else + simde_bfloat16x8_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_bfloat16x8_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1q_bf16_x2 + #define vst1q_bf16_x2(a, b) simde_vst1q_bf16_x2((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1Q_X2_H) */ diff --git a/arm/neon/st1q_x3.h b/arm/neon/st1q_x3.h new file mode 100644 index 000000000..73dcb9c8e --- /dev/null +++ b/arm/neon/st1q_x3.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * 
obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1Q_X3_H) +#define SIMDE_ARM_NEON_ST1Q_X3_H + +#include "types.h" +#include "st1.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1q_f16_x3(ptr, val); + #else + simde_float16x8_private a[3] = { simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a[1].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+16 , a[2].sv128 , 8); + #else + simde_float16_t buf[24]; + for (size_t i = 0; i < 24 ; i++) { + buf[i] = a[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1q_f16_x3 + #define vst1q_f16_x3(a, b) simde_vst1q_f16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f32_x3(simde_float32 ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_f32_x3(ptr, val); + #else + simde_vst1q_f32(ptr, val.val[0]); + simde_vst1q_f32(ptr+4, val.val[1]); + simde_vst1q_f32(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_f32_x3 + #define vst1q_f32_x3(ptr, val) simde_vst1q_f32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f64_x3(simde_float64 ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1q_f64_x3(ptr, val); + #else + simde_vst1q_f64(ptr, val.val[0]); + simde_vst1q_f64(ptr+2, val.val[1]); + simde_vst1q_f64(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_f64_x3 + #define vst1q_f64_x3(ptr, val) simde_vst1q_f64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
+simde_vst1q_s8_x3(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s8_x3(ptr, val); + #else + simde_vst1q_s8(ptr, val.val[0]); + simde_vst1q_s8(ptr+16, val.val[1]); + simde_vst1q_s8(ptr+32, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s8_x3 + #define vst1q_s8_x3(ptr, val) simde_vst1q_s8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s16_x3(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s16_x3(ptr, val); + #else + simde_vst1q_s16(ptr, val.val[0]); + simde_vst1q_s16(ptr+8, val.val[1]); + simde_vst1q_s16(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s16_x3 + #define vst1q_s16_x3(ptr, val) simde_vst1q_s16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s32_x3(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s32_x3(ptr, val); + #else + simde_vst1q_s32(ptr, val.val[0]); + simde_vst1q_s32(ptr+4, val.val[1]); + simde_vst1q_s32(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s32_x3 + #define vst1q_s32_x3(ptr, val) simde_vst1q_s32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s64_x3(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s64_x3(ptr, val); + #else + simde_vst1q_s64(ptr, val.val[0]); + simde_vst1q_s64(ptr+2, val.val[1]); + simde_vst1q_s64(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s64_x3 + #define vst1q_s64_x3(ptr, val) simde_vst1q_s64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u8_x3(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u8_x3(ptr, val); + #else + simde_vst1q_u8(ptr, val.val[0]); + simde_vst1q_u8(ptr+16, val.val[1]); + simde_vst1q_u8(ptr+32, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u8_x3 + #define vst1q_u8_x3(ptr, val) simde_vst1q_u8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u16_x3(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u16_x3(ptr, val); + #else + simde_vst1q_u16(ptr, val.val[0]); + simde_vst1q_u16(ptr+8, val.val[1]); + simde_vst1q_u16(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u16_x3 + #define vst1q_u16_x3(ptr, val) simde_vst1q_u16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
+simde_vst1q_u32_x3(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u32_x3(ptr, val); + #else + simde_vst1q_u32(ptr, val.val[0]); + simde_vst1q_u32(ptr+4, val.val[1]); + simde_vst1q_u32(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u32_x3 + #define vst1q_u32_x3(ptr, val) simde_vst1q_u32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u64_x3(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u64_x3(ptr, val); + #else + simde_vst1q_u64(ptr, val.val[0]); + simde_vst1q_u64(ptr+2, val.val[1]); + simde_vst1q_u64(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u64_x3 + #define vst1q_u64_x3(ptr, val) simde_vst1q_u64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p8_x3(ptr, val); + #else + simde_poly8x16_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly8x16_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+32 , val_[2].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p8_x3 + #define vst1q_p8_x3(a, b) simde_vst1q_p8_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p16_x3(ptr, val); + #else + simde_poly16x8_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly16x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+16 , val_[2].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p16_x3 + #define vst1q_p16_x3(a, b) simde_vst1q_p16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p64_x3(ptr, val); + #else + simde_poly64x2_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = 
simde_poly64x2_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+4 , val_[2].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p64_x3 + #define vst1q_p64_x3(a, b) simde_vst1q_p64_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16_x3(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_bfloat16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16_x3(ptr, val); + #else + simde_bfloat16x8_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_bfloat16x8_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1q_bf16_x3 + #define vst1q_bf16_x3(a, b) simde_vst1q_bf16_x3((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1Q_X3_H) */ diff --git a/arm/neon/st1q_x4.h b/arm/neon/st1q_x4.h new file mode 100644 index 000000000..a489e448e --- /dev/null +++ b/arm/neon/st1q_x4.h @@ -0,0 +1,354 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1Q_X4_H) +#define SIMDE_ARM_NEON_ST1Q_X4_H + +#include "types.h" +#include "st1.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_float16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1q_f16_x4(ptr, val); + #else + simde_float16x8_private a_[4] = { simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]), simde_float16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[1].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+16 , a_[2].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+24 , a_[3].sv128 , 8); + #else + simde_float16_t buf[32]; + for (size_t i = 0; i < 32 ; i++) { + buf[i] = a_[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst1q_f16_x4 + #define vst1q_f16_x4(a, b) simde_vst1q_f16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(16)], simde_float32x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_f32_x4(ptr, val); + #else + simde_vst1q_f32(ptr, val.val[0]); + simde_vst1q_f32(ptr+4, val.val[1]); + simde_vst1q_f32(ptr+8, val.val[2]); + simde_vst1q_f32(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_f32_x4 + #define vst1q_f32_x4(ptr, val) simde_vst1q_f32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1q_f64_x4(ptr, val); + #else + simde_vst1q_f64(ptr, val.val[0]); + simde_vst1q_f64(ptr+2, val.val[1]); + simde_vst1q_f64(ptr+4, val.val[2]); + simde_vst1q_f64(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_f64_x4 + #define vst1q_f64_x4(ptr, val) simde_vst1q_f64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_int8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s8_x4(ptr, val); + #else + simde_vst1q_s8(ptr, val.val[0]); + simde_vst1q_s8(ptr+16, val.val[1]); + simde_vst1q_s8(ptr+32, val.val[2]); + simde_vst1q_s8(ptr+48, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s8_x4 + #define vst1q_s8_x4(ptr, val) simde_vst1q_s8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + 
vst1q_s16_x4(ptr, val); + #else + simde_vst1q_s16(ptr, val.val[0]); + simde_vst1q_s16(ptr+8, val.val[1]); + simde_vst1q_s16(ptr+16, val.val[2]); + simde_vst1q_s16(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s16_x4 + #define vst1q_s16_x4(ptr, val) simde_vst1q_s16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int32x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s32_x4(ptr, val); + #else + simde_vst1q_s32(ptr, val.val[0]); + simde_vst1q_s32(ptr+4, val.val[1]); + simde_vst1q_s32(ptr+8, val.val[2]); + simde_vst1q_s32(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s32_x4 + #define vst1q_s32_x4(ptr, val) simde_vst1q_s32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s64_x4(ptr, val); + #else + simde_vst1q_s64(ptr, val.val[0]); + simde_vst1q_s64(ptr+2, val.val[1]); + simde_vst1q_s64(ptr+4, val.val[2]); + simde_vst1q_s64(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_s64_x4 + #define vst1q_s64_x4(ptr, val) simde_vst1q_s64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_uint8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u8_x4(ptr, val); + #else + simde_vst1q_u8(ptr, val.val[0]); + simde_vst1q_u8(ptr+16, val.val[1]); + simde_vst1q_u8(ptr+32, val.val[2]); + simde_vst1q_u8(ptr+48, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u8_x4 + #define vst1q_u8_x4(ptr, val) simde_vst1q_u8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u16_x4(ptr, val); + #else + simde_vst1q_u16(ptr, val.val[0]); + simde_vst1q_u16(ptr+8, val.val[1]); + simde_vst1q_u16(ptr+16, val.val[2]); + simde_vst1q_u16(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u16_x4 + #define vst1q_u16_x4(ptr, val) simde_vst1q_u16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint32x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u32_x4(ptr, val); + #else + simde_vst1q_u32(ptr, val.val[0]); + simde_vst1q_u32(ptr+4, val.val[1]); + simde_vst1q_u32(ptr+8, val.val[2]); + simde_vst1q_u32(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u32_x4 + #define vst1q_u32_x4(ptr, val) 
simde_vst1q_u32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u64_x4(ptr, val); + #else + simde_vst1q_u64(ptr, val.val[0]); + simde_vst1q_u64(ptr+2, val.val[1]); + simde_vst1q_u64(ptr+4, val.val[2]); + simde_vst1q_u64(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) + #undef vst1q_u64_x4 + #define vst1q_u64_x4(ptr, val) simde_vst1q_u64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_poly8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p8_x4(ptr, val); + #else + simde_poly8x16_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly8x16_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+32 , val_[2].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+48 , val_[3].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p8_x4 + #define vst1q_p8_x4(a, b) simde_vst1q_p8_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p16_x4(ptr, val); + #else + simde_poly16x8_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly16x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+16 , val_[2].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+24 , val_[3].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p16_x4 + #define vst1q_p16_x4(a, b) simde_vst1q_p16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p64_x4(ptr, val); + #else + simde_poly64x2_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly64x2_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+4 , val_[2].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+6 , val_[3].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) + #undef vst1q_p64_x4 + #define vst1q_p64_x4(a, b) simde_vst1q_p64_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16_x4(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_bfloat16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16_x4(ptr, val); + #else + simde_bfloat16x8_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_bfloat16x8_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst1q_bf16_x4 + #define vst1q_bf16_x4(a, b) simde_vst1q_bf16_x4((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1Q_X4_H) */ diff --git a/arm/neon/st2.h b/arm/neon/st2.h index 9dcaef633..20dc145a9 100644 --- a/arm/neon/st2.h +++ b/arm/neon/st2.h @@ -23,12 +23,15 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST2_H) #define SIMDE_ARM_NEON_ST2_H -#include "types.h" +#include "st1.h" +#include "combine.h" #include "zip.h" HEDLEY_DIAGNOSTIC_PUSH @@ -37,19 +40,55 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_f16(simde_float16_t *ptr, simde_float16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst2_f16(ptr, val); + #else + simde_float16x4_private a_[2] = {simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_f16m1x2 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[8]; + for (size_t i = 0; i < 8 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst2_f16 + #define vst2_f16(a, b) simde_vst2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2_f32(simde_float32_t *ptr, simde_float32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_f32(ptr, val); #else - simde_float32_t buf[4]; simde_float32x2_private a_[2] = {simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_f32m1x2 (ptr, dest, 2); + #else + simde_float32_t 
buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -63,13 +102,20 @@ simde_vst2_f64(simde_float64_t *ptr, simde_float64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2_f64(ptr, val); #else - simde_float64_t buf[2]; simde_float64x1_private a_[2] = {simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_f64m1x2 (ptr, dest, 1); + #else + simde_float64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -83,13 +129,20 @@ simde_vst2_s8(int8_t *ptr, simde_int8x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s8(ptr, val); #else - int8_t buf[16]; simde_int8x8_private a_[2] = {simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_i8m1x2 (ptr, dest, 8); + #else + int8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -103,13 +156,20 @@ simde_vst2_s16(int16_t *ptr, simde_int16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s16(ptr, val); #else - int16_t buf[8]; simde_int16x4_private a_[2] = {simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_i16m1x2 (ptr, dest, 4); + #else + int16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -123,13 +183,20 @@ simde_vst2_s32(int32_t *ptr, simde_int32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s32(ptr, val); #else - int32_t buf[4]; simde_int32x2_private a_[2] = {simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; 
- } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_i32m1x2 (ptr, dest, 2); + #else + int32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -143,13 +210,20 @@ simde_vst2_s64(int64_t *ptr, simde_int64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s64(ptr, val); #else - int64_t buf[2]; simde_int64x1_private a_[2] = {simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_i64m1x2 (ptr, dest, 1); + #else + int64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -162,14 +236,37 @@ void simde_vst2_u8(uint8_t *ptr, simde_uint8x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u8(ptr, val); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + simde_uint8x16_private r0_; + simde_uint8x16_private ab_ = simde_uint8x16_to_private(simde_vcombine_u8(val.val[0], val.val[1])); + + r0_.v128 = wasm_i8x16_shuffle(ab_.v128, ab_.v128, + 0, 8, + 1, 9, + 2, 10, + 3, 11, + 4, 12, + 5, 13, + 6, 14, + 7, 15 + ); + + wasm_v128_store(ptr, r0_.v128); #else - uint8_t buf[16]; simde_uint8x8_private a_[2] = {simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 8); + #else + uint8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -183,13 +280,20 @@ simde_vst2_u16(uint16_t *ptr, simde_uint16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u16(ptr, val); #else - uint16_t buf[8]; simde_uint16x4_private a_[2] = {simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, 
a_[1].sv64); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 4); + #else + uint16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -203,13 +307,20 @@ simde_vst2_u32(uint32_t *ptr, simde_uint32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u32(ptr, val); #else - uint32_t buf[4]; simde_uint32x2_private a_[2] = {simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_u32m1x2 (ptr, dest, 2); + #else + uint32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -223,13 +334,20 @@ simde_vst2_u64(uint64_t *ptr, simde_uint64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u64(ptr, val); #else - uint64_t buf[2]; simde_uint64x1_private a_[2] = {simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 1); + #else + uint64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -237,11 +355,43 @@ simde_vst2_u64(uint64_t *ptr, simde_uint64x1x2_t val) { #define vst2_u64(a, b) simde_vst2_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_f16(simde_float16_t *ptr, simde_float16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst2q_f16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x8_private a_[2] = {simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1])}; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_f16m1x2 ((_Float16 *)ptr, dest, 8); + #else + simde_float16x8x2_t r = simde_vzipq_f16(val.val[0], val.val[1]); + simde_vst1q_f16(ptr, r.val[0]); + simde_vst1q_f16(ptr+8, r.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst2q_f16 + #define vst2q_f16(a, b) simde_vst2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void 
simde_vst2q_f32(simde_float32_t *ptr, simde_float32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_f32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_float32x4_private a_[2] = {simde_float32x4_to_private(val.val[0]), + simde_float32x4_to_private(val.val[1])}; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_f32m1x2 (ptr, dest, 4); #else simde_float32x4x2_t r = simde_vzipq_f32(val.val[0], val.val[1]); simde_vst1q_f32(ptr, r.val[0]); @@ -259,13 +409,20 @@ simde_vst2q_f64(simde_float64_t *ptr, simde_float64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_f64(ptr, val); #else - simde_float64_t buf[4]; simde_float64x2_private a_[2] = {simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_f64m1x2 (ptr, dest, 2); + #else + simde_float64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -278,6 +435,13 @@ void simde_vst2q_s8(int8_t *ptr, simde_int8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s8(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int8x16_private a_[2] = {simde_int8x16_to_private(val.val[0]), + simde_int8x16_to_private(val.val[1])}; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_i8m1x2 (ptr, dest, 16); #else simde_int8x16x2_t r = simde_vzipq_s8(val.val[0], val.val[1]); simde_vst1q_s8(ptr, r.val[0]); @@ -294,6 +458,13 @@ void simde_vst2q_s16(int16_t *ptr, simde_int16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int16x8_private a_[2] = {simde_int16x8_to_private(val.val[0]), + simde_int16x8_to_private(val.val[1])}; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_i16m1x2 (ptr, dest, 8); #else simde_int16x8x2_t r = simde_vzipq_s16(val.val[0], val.val[1]); simde_vst1q_s16(ptr, r.val[0]); @@ -310,6 +481,13 @@ void simde_vst2q_s32(int32_t *ptr, simde_int32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int32x4_private a_[2] = {simde_int32x4_to_private(val.val[0]), + simde_int32x4_to_private(val.val[1])}; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_i32m1x2 (ptr, dest, 4); #else 
simde_int32x4x2_t r = simde_vzipq_s32(val.val[0], val.val[1]); simde_vst1q_s32(ptr, r.val[0]); @@ -326,6 +504,13 @@ void simde_vst2q_s64(int64_t *ptr, simde_int64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_s64(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_int64x2_private a_[2] = {simde_int64x2_to_private(val.val[0]), + simde_int64x2_to_private(val.val[1])}; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_i64m1x2 (ptr, dest, 2); #else int64_t buf[4]; simde_int64x2_private a_[2] = {simde_int64x2_to_private(val.val[0]), @@ -346,6 +531,13 @@ void simde_vst2q_u8(uint8_t *ptr, simde_uint8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u8(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint8x16_private a_[2] = {simde_uint8x16_to_private(val.val[0]), + simde_uint8x16_to_private(val.val[1])}; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 16); #else simde_uint8x16x2_t r = simde_vzipq_u8(val.val[0], val.val[1]); simde_vst1q_u8(ptr, r.val[0]); @@ -362,6 +554,13 @@ void simde_vst2q_u16(uint16_t *ptr, simde_uint16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint16x8_private a_[2] = {simde_uint16x8_to_private(val.val[0]), + simde_uint16x8_to_private(val.val[1])}; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 8); #else simde_uint16x8x2_t r = simde_vzipq_u16(val.val[0], val.val[1]); simde_vst1q_u16(ptr, r.val[0]); @@ -378,6 +577,13 @@ void simde_vst2q_u32(uint32_t *ptr, simde_uint32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + simde_uint32x4_private a_[2] = {simde_uint32x4_to_private(val.val[0]), + simde_uint32x4_to_private(val.val[1])}; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_u32m1x2 (ptr, dest, 4); #else simde_uint32x4x2_t r = simde_vzipq_u32(val.val[0], val.val[1]); simde_vst1q_u32(ptr, r.val[0]); @@ -395,18 +601,229 @@ simde_vst2q_u64(uint64_t *ptr, simde_uint64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_u64(ptr, val); #else - uint64_t buf[4]; simde_uint64x2_private a_[2] = {simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 2); + #else + uint64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_u64 + #define vst2q_u64(a, b) simde_vst2q_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_p8(simde_poly8_t *ptr, simde_poly8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2_p8(ptr, val); + #else + simde_poly8x8_private a_[2] = {simde_poly8x8_to_private(val.val[0]), + simde_poly8x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 8); + #else + simde_poly8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_p8 + #define vst2_p8(a, b) simde_vst2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_p16(simde_poly16_t *ptr, simde_poly16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2_p16(ptr, val); + #else + simde_poly16x4_private a_[2] = {simde_poly16x4_to_private(val.val[0]), + simde_poly16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 4); + #else + simde_poly16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_p16 + #define vst2_p16(a, b) simde_vst2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_p64(simde_poly64_t *ptr, simde_poly64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst2_p64(ptr, val); + #else + simde_poly64x1_private a_[2] = {simde_poly64x1_to_private(val.val[0]), + simde_poly64x1_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 1); + #else + simde_poly64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst2_p64 + #define vst2_p64(a, b) simde_vst2_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_p8(simde_poly8_t *ptr, simde_poly8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2q_p8(ptr, val); + #else + simde_poly8x16_private a_[2] = {simde_poly8x16_to_private(val.val[0]), + simde_poly8x16_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 16); + #else + simde_poly8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / 
sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_p8 + #define vst2q_p8(a, b) simde_vst2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_p16(simde_poly16_t *ptr, simde_poly16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2q_p16(ptr, val); + #else + simde_poly16x8_private a_[2] = {simde_poly16x8_to_private(val.val[0]), + simde_poly16x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 8); + #else + simde_poly16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_p16 + #define vst2q_p16(a, b) simde_vst2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_p64(simde_poly64_t *ptr, simde_poly64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst2q_p64(ptr, val); + #else + simde_poly64x2_private a_[2] = {simde_poly64x2_to_private(val.val[0]), + simde_poly64x2_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 2); + #else + simde_poly64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_p64 + #define vst2q_p64(a, b) simde_vst2q_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst2_bf16(ptr, val); + #else + simde_bfloat16x4_private a_[2] = {simde_bfloat16x4_to_private(val.val[0]), + simde_bfloat16x4_to_private(val.val[1])}; + simde_bfloat16_t buf[8]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { buf[i] = a_[i % 2].values[i / 2]; } simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - #undef vst2q_u64 - #define vst2q_u64(a, b) simde_vst2q_u64((a), (b)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst2_bf16 + #define vst2_bf16(a, b) simde_vst2_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst2q_bf16(ptr, val); + #else + simde_bfloat16x8_private a_[2] = {simde_bfloat16x8_to_private(val.val[0]), + simde_bfloat16x8_to_private(val.val[1])}; + simde_bfloat16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst2q_bf16 + #define vst2q_bf16(a, b) simde_vst2q_bf16((a), (b)) #endif #endif /* !defined(SIMDE_BUG_INTEL_857088) */ diff --git a/arm/neon/st2_lane.h b/arm/neon/st2_lane.h index 0eee6a8a4..eb43dfcc1 100644 --- a/arm/neon/st2_lane.h +++ b/arm/neon/st2_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST2_LANE_H) @@ -189,6 +190,26 @@ simde_vst2_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t val, #define vst2_lane_u64(a, b, c) simde_vst2_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x4x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst2_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x4_private r; + for (size_t i = 0 ; i < 2 ; i ++) { + r = simde_float16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst2_lane_f16 + #define vst2_lane_f16(a, b, c) simde_vst2_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2x2_t val, const int lane) @@ -380,6 +401,26 @@ simde_vst2q_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2x2_t val #define vst2q_lane_u64(a, b, c) simde_vst2q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_float16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst2q_lane_f16 + #define vst2q_lane_f16(a, b, c) simde_vst2q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x4x2_t val, const int lane) @@ -418,6 +459,161 @@ simde_vst2q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2 #define vst2q_lane_f64(a, b, c) simde_vst2q_lane_f64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly8x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_p8 + #define vst2_lane_p8(a, b, c) simde_vst2_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
+simde_vst2_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x4x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst2_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_p16 + #define vst2_lane_p16(a, b, c) simde_vst2_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + HEDLEY_STATIC_CAST(void, lane); + vst2_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly64x1_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_p64 + #define vst2_lane_p64(a, b, c) simde_vst2_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x16x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst2q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly8x16_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_p8 + #define vst2q_lane_p8(a, b, c) simde_vst2q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_p16 + #define vst2q_lane_p16(a, b, c) simde_vst2q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst2q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly64x2_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_p64 + #define vst2q_lane_p64(a, b, c) simde_vst2q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x4x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst2_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private r; + for (size_t i = 0 ; i < 2 ; i ++) { + r = 
simde_bfloat16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst2_lane_bf16 + #define vst2_lane_bf16(a, b, c) simde_vst2_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_bfloat16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst2q_lane_bf16 + #define vst2q_lane_bf16(a, b, c) simde_vst2q_lane_bf16((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/st3.h b/arm/neon/st3.h index 2a3616d42..8849fa130 100644 --- a/arm/neon/st3.h +++ b/arm/neon/st3.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST3_H) @@ -37,6 +39,37 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst3_f16(ptr, val); + #else + simde_float16x4_private a[3] = { simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e16_v_f16m1x3 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[12]; + for (size_t i = 0; i < 12 ; i++) { + buf[i] = a[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst3_f16 + #define vst3_f16(a, b) simde_vst3_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t val) { @@ -46,7 +79,13 @@ simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t v simde_float32x2_private a[3] = { simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1]), simde_float32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_f32m1x3 (ptr, dest, 2); + 
#elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -55,7 +94,7 @@ simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t v simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else simde_float32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -76,9 +115,17 @@ simde_vst3_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t v simde_float64x1_private a_[3] = { simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1]), simde_float64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_f64m1x3(ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -95,7 +142,13 @@ simde_vst3_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_int8x8_private a_[3] = { simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1]), simde_int8x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_i8m1x3(ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(8, 8, r0, a_[2].values, @@ -115,7 +168,7 @@ simde_vst3_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else int8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -136,7 +189,13 @@ simde_vst3_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_int16x4_private a_[3] = { simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1]), simde_int16x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 1, a_[1].sv64); + dest = 
__riscv_vset_v_i16m1_i16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_i16m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 8, r0, a_[2].values, @@ -156,7 +215,7 @@ simde_vst3_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else int16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -177,7 +236,13 @@ simde_vst3_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_int32x2_private a[3] = { simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1]), simde_int32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_i32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -186,7 +251,7 @@ simde_vst3_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else int32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -207,12 +272,20 @@ simde_vst3_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t val) { simde_int64x1_private a_[3] = { simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1]), simde_int64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_i64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst3_s64 #define vst3_s64(a, b) simde_vst3_s64((a), (b)) #endif @@ -226,7 +299,13 @@ simde_vst3_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_uint8x8_private a_[3] = { simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1]), simde_uint8x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) && 
defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(8, 8, r0, a_[2].values, @@ -246,7 +325,7 @@ simde_vst3_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else uint8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -267,7 +346,13 @@ simde_vst3_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { simde_uint16x4_private a_[3] = { simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1]), simde_uint16x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 8, r0, a_[2].values, @@ -287,7 +372,7 @@ simde_vst3_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else uint16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -308,7 +393,13 @@ simde_vst3_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_uint32x2_private a[3] = { simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1]), simde_uint32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_u32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -317,7 +408,7 @@ simde_vst3_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else uint32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -338,16 +429,55 @@ simde_vst3_u64(uint64_t 
ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val) { simde_uint64x1_private a_[3] = { simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1]), simde_uint64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst3_u64 #define vst3_u64(a, b) simde_vst3_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst3q_f16(ptr, val); + #else + simde_float16x8_private a_[3] = { simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_f16m1x3 ((_Float16 *)ptr, dest, 8); + #else + simde_float16_t buf[24]; + for (size_t i = 0; i < 24 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst3q_f16 + #define vst3q_f16(a, b) simde_vst3q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t val) { @@ -357,7 +487,13 @@ simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t simde_float32x4_private a_[3] = { simde_float32x4_to_private(val.val[0]), simde_float32x4_to_private(val.val[1]), simde_float32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_f32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -377,7 +513,7 @@ simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else simde_float32_t buf[12]; - for 
(size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -398,7 +534,13 @@ simde_vst3q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t simde_float64x2_private a[3] = { simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1]), simde_float64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_f64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -407,7 +549,7 @@ simde_vst3q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else simde_float64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -428,7 +570,13 @@ simde_vst3q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { simde_int8x16_private a_[3] = { simde_int8x16_to_private(val.val[0]), simde_int8x16_to_private(val.val[1]), simde_int8x16_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_i8m1x3 (ptr, dest, 16); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_[0].values, a_[1].values, 0, 16, 6, 1, 17, 7, 2, 18, 8, 3, 19, 9, 4, 20, 10, 5); @@ -453,7 +601,7 @@ simde_vst3q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { simde_memcpy(&ptr[32], &m2, sizeof(m2)); #else int8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 48 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -474,7 +622,13 @@ simde_vst3q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_int16x8_private a_[3] = { simde_int16x8_to_private(val.val[0]), simde_int16x8_to_private(val.val[1]), simde_int16x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_i16m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 16, r0, a_[2].values, @@ -494,7 
+648,7 @@ simde_vst3q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else int16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -515,7 +669,13 @@ simde_vst3q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_int32x4_private a_[3] = { simde_int32x4_to_private(val.val[0]), simde_int32x4_to_private(val.val[1]), simde_int32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_i32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -535,7 +695,7 @@ simde_vst3q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else int32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -556,7 +716,13 @@ simde_vst3q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_int64x2_private a[3] = { simde_int64x2_to_private(val.val[0]), simde_int64x2_to_private(val.val[1]), simde_int64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_i64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -565,7 +731,7 @@ simde_vst3q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else int64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -618,6 +784,12 @@ simde_vst3q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { v128_t m2 = wasm_i8x16_shuffle(r2, r1, 0, 1, 18, 3, 4, 21, 6, 7, 24, 9, 10, 27, 12, 13, 30, 15); wasm_v128_store(ptr + 32, m2); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_[0].values, a_[1].values, 
0, 16, 6, 1, 17, 7, 2, 18, 8, 3, 19, 9, @@ -643,7 +815,7 @@ simde_vst3q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { simde_memcpy(&ptr[32], &m2, sizeof(m2)); #else uint8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 48 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -665,7 +837,13 @@ simde_vst3q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { simde_uint16x8_to_private(val.val[1]), simde_uint16x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 16, r0, a_[2].values, @@ -685,7 +863,7 @@ simde_vst3q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else uint16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -707,7 +885,13 @@ simde_vst3q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { simde_uint32x4_to_private(val.val[1]), simde_uint32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_u32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -727,7 +911,7 @@ simde_vst3q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else uint32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -748,7 +932,13 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { simde_uint64x2_private a[3] = { simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1]), simde_uint64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, 
a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -757,7 +947,7 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else uint64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -769,6 +959,223 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { #define vst3q_u64(a, b) simde_vst3q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst3_p8(ptr, val); + #else + simde_poly8x8_private a_[3] = { simde_poly8x8_to_private(val.val[0]), + simde_poly8x8_to_private(val.val[1]), + simde_poly8x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 8); + #else + simde_poly8_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_p8 + #define vst3_p8(a, b) simde_vst3_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst3_p16(ptr, val); + #else + simde_poly16x4_private a_[3] = { simde_poly16x4_to_private(val.val[0]), + simde_poly16x4_to_private(val.val[1]), + simde_poly16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 4); + #else + simde_poly16_t buf[12]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_p16 + #define vst3_p16(a, b) simde_vst3_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst3_p64(ptr, val); + #else + simde_poly64x1_private a_[3] = { simde_poly64x1_to_private(val.val[0]), + simde_poly64x1_to_private(val.val[1]), + simde_poly64x1_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + 
simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst3_p64 + #define vst3_p64(a, b) simde_vst3_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst3q_p8(ptr, val); + #else + simde_poly8x16_private a_[3] = {simde_poly8x16_to_private(val.val[0]), + simde_poly8x16_to_private(val.val[1]), + simde_poly8x16_to_private(val.val[2])}; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 16); + #else + simde_poly8_t buf[48]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_p8 + #define vst3q_p8(a, b) simde_vst3q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst3q_p16(ptr, val); + #else + simde_poly16x8_private a_[3] = { simde_poly16x8_to_private(val.val[0]), + simde_poly16x8_to_private(val.val[1]), + simde_poly16x8_to_private(val.val[2]) }; + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 8); + #else + simde_poly16_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_p16 + #define vst3q_p16(a, b) simde_vst3q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst3q_p64(ptr, val); + #else + simde_poly64x2_private a_[3] = { simde_poly64x2_to_private(val.val[0]), + simde_poly64x2_to_private(val.val[1]), + simde_poly64x2_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 2); + #else + simde_poly64_t buf[6]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_p64 + #define vst3q_p64(a, b) simde_vst3q_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_bfloat16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) 
&& defined(SIMDE_ARM_NEON_BF16) + vst3_bf16(ptr, val); + #else + simde_bfloat16x4_private a[3] = { simde_bfloat16x4_to_private(val.val[0]), + simde_bfloat16x4_to_private(val.val[1]), + simde_bfloat16x4_to_private(val.val[2]) }; + simde_bfloat16_t buf[12]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst3_bf16 + #define vst3_bf16(a, b) simde_vst3_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_bfloat16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst3q_bf16(ptr, val); + #else + simde_bfloat16x8_private a_[3] = { simde_bfloat16x8_to_private(val.val[0]), + simde_bfloat16x8_to_private(val.val[1]), + simde_bfloat16x8_to_private(val.val[2]) }; + simde_bfloat16_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst3q_bf16 + #define vst3q_bf16(a, b) simde_vst3q_bf16((a), (b)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/st3_lane.h b/arm/neon/st3_lane.h index ba3283b24..e07ce6948 100644 --- a/arm/neon/st3_lane.h +++ b/arm/neon/st3_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST3_LANE_H) @@ -189,6 +190,26 @@ simde_vst3_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val, #define vst3_lane_u64(a, b, c) simde_vst3_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x4x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst3_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x4_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_float16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst3_lane_f16 + #define vst3_lane_f16(a, b, c) simde_vst3_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x2x3_t val, const int lane) @@ -380,6 +401,26 @@ simde_vst3q_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x2x3_t val #define vst3q_lane_u64(a, b, c) simde_vst3q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_float16x8_to_private(val.val[i]); + ptr[i] = 
r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst3q_lane_f16 + #define vst3q_lane_f16(a, b, c) simde_vst3q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4x3_t val, const int lane) @@ -401,7 +442,7 @@ simde_vst3q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4 SIMDE_FUNCTION_ATTRIBUTES void -simde_vst3q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t val, const int lane){ +simde_vst3q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t val, const int lane) { //SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) SIMDE_CONSTIFY_2_NO_RESULT_(vst3q_lane_f64, HEDLEY_UNREACHABLE(), lane, ptr, val); @@ -418,6 +459,161 @@ simde_vst3q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2 #define vst3q_lane_f64(a, b, c) simde_vst3q_lane_f64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly8x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_p8 + #define vst3_lane_p8(a, b, c) simde_vst3_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x4x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst3_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_p16 + #define vst3_lane_p16(a, b, c) simde_vst3_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + HEDLEY_STATIC_CAST(void, lane); + vst3_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly64x1_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_p64 + #define vst3_lane_p64(a, b, c) simde_vst3_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x16x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst3q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly8x16_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_p8 + #define vst3q_lane_p8(a, b, c) 
simde_vst3q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_p16 + #define vst3q_lane_p16(a, b, c) simde_vst3q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x2x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst3q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly64x2_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_p64 + #define vst3q_lane_p64(a, b, c) simde_vst3q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x4x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst3_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_bfloat16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst3_lane_bf16 + #define vst3_lane_bf16(a, b, c) simde_vst3_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_bfloat16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst3q_lane_bf16 + #define vst3q_lane_bf16(a, b, c) simde_vst3q_lane_bf16((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/st4.h b/arm/neon/st4.h index 2ccb1c3dd..7a95b62ae 100644 --- a/arm/neon/st4.h +++ b/arm/neon/st4.h @@ -23,12 +23,14 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST4_H) #define SIMDE_ARM_NEON_ST4_H -#include "types.h" +#include "combine.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -36,19 +38,59 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_f16(simde_float16_t *ptr, simde_float16x4x4_t val) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst4_f16(ptr, val); + #else + simde_float16x4_private a_[4] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]), simde_float16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_f16m1x4 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst4_f16 + #define vst4_f16(a, b) simde_vst4_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4_f32(simde_float32_t *ptr, simde_float32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_f32(ptr, val); #else - simde_float32_t buf[8]; simde_float32x2_private a_[4] = { simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1]), simde_float32x2_to_private(val.val[2]), simde_float32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_f32m1x4 (ptr, dest, 2); + #else + simde_float32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -62,13 +104,22 @@ simde_vst4_f64(simde_float64_t *ptr, simde_float64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4_f64(ptr, val); #else - simde_float64_t buf[4]; simde_float64x1_private a_[4] = { simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1]), simde_float64x1_to_private(val.val[2]), simde_float64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_f64m1x4(ptr, dest, 1); + #else + simde_float64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + 
} + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -82,13 +133,22 @@ simde_vst4_s8(int8_t *ptr, simde_int8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s8(ptr, val); #else - int8_t buf[32]; simde_int8x8_private a_[4] = { simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1]), simde_int8x8_to_private(val.val[2]), simde_int8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_i8m1x4(ptr, dest, 8); + #else + int8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -102,13 +162,22 @@ simde_vst4_s16(int16_t *ptr, simde_int16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s16(ptr, val); #else - int16_t buf[16]; simde_int16x4_private a_[4] = { simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1]), simde_int16x4_to_private(val.val[2]), simde_int16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_i16m1x4 (ptr, dest, 4); + #else + int16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -122,13 +191,22 @@ simde_vst4_s32(int32_t *ptr, simde_int32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s32(ptr, val); #else - int32_t buf[8]; simde_int32x2_private a_[4] = { simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1]), simde_int32x2_to_private(val.val[2]), simde_int32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_i32m1x4 (ptr, dest, 2); + #else + int32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif 
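/*
 * Note on the portable fallback above: the scalar loop interleaves the four
 * 2-lane vectors into vst4 element order (a0, b0, c0, d0, a1, b1, c1, d1),
 * since buf[i] takes lane (i / 4) of vector (i % 4). A minimal standalone
 * sketch of the same index mapping, using hypothetical arrays:
 *
 *   int32_t a[2] = {1, 2}, b[2] = {3, 4}, c[2] = {5, 6}, d[2] = {7, 8};
 *   const int32_t *src[4] = {a, b, c, d};
 *   int32_t buf[8];
 *   for (size_t i = 0; i < 8; i++) buf[i] = src[i % 4][i / 4];
 *   // buf now holds {1, 3, 5, 7, 2, 4, 6, 8}, i.e. a0 b0 c0 d0 a1 b1 c1 d1
 */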
#endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -142,16 +220,25 @@ simde_vst4_s64(int64_t *ptr, simde_int64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s64(ptr, val); #else - int64_t buf[4]; simde_int64x1_private a_[4] = { simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1]), simde_int64x1_to_private(val.val[2]), simde_int64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_i64m1x4 (ptr, dest, 1); + #else + int64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst4_s64 #define vst4_s64(a, b) simde_vst4_s64((a), (b)) #endif @@ -161,14 +248,44 @@ void simde_vst4_u8(uint8_t *ptr, simde_uint8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u8(ptr, val); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + simde_uint16x8_private r0_, r1_; + simde_uint8x16_private ab_ = simde_uint8x16_to_private(simde_vcombine_u8(val.val[0], val.val[1])); + simde_uint8x16_private cd_ = simde_uint8x16_to_private(simde_vcombine_u8(val.val[2], val.val[3])); + + // Perform the interleaving + r0_.v128 = wasm_i8x16_shuffle(ab_.v128, cd_.v128, + 0, 8, 16, 24, + 1, 9, 17, 25, + 2, 10, 18, 26, + 3, 11, 19, 27 + ); + r1_.v128 = wasm_i8x16_shuffle(ab_.v128, cd_.v128, + 4, 12, 20, 28, + 5, 13, 21, 29, + 6, 14, 22, 30, + 7, 15, 23, 31 + ); + + wasm_v128_store(ptr, r0_.v128); + wasm_v128_store(ptr + sizeof(r0_), r1_.v128); #else - uint8_t buf[32]; simde_uint8x8_private a_[4] = { simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1]), simde_uint8x8_to_private(val.val[2]), simde_uint8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 8); + #else + uint8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -182,13 +299,22 @@ simde_vst4_u16(uint16_t *ptr, simde_uint16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u16(ptr, val); #else - uint16_t buf[16]; simde_uint16x4_private a_[4] = { simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1]), simde_uint16x4_to_private(val.val[2]), simde_uint16x4_to_private(val.val[3]) }; - for (size_t i = 
0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 4); + #else + uint16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -202,13 +328,22 @@ simde_vst4_u32(uint32_t *ptr, simde_uint32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u32(ptr, val); #else - uint32_t buf[8]; simde_uint32x2_private a_[4] = { simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1]), simde_uint32x2_to_private(val.val[2]), simde_uint32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_u32m1x4 (ptr, dest, 2); + #else + uint32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -222,33 +357,82 @@ simde_vst4_u64(uint64_t *ptr, simde_uint64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u64(ptr, val); #else - uint64_t buf[4]; simde_uint64x1_private a_[4] = { simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1]), simde_uint64x1_to_private(val.val[2]), simde_uint64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 1); + #else + uint64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst4_u64 #define vst4_u64(a, b) simde_vst4_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_f16(simde_float16_t *ptr, simde_float16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst4q_f16(ptr, val); + #else + simde_float16x8_private a_[4] = { 
simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]), simde_float16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) \ + && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_f16m1x4 ((_Float16 *)ptr, dest, 8); + #else + simde_float16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst4q_f16 + #define vst4q_f16(a, b) simde_vst4q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4q_f32(simde_float32_t *ptr, simde_float32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_f32(ptr, val); #else - simde_float32_t buf[16]; simde_float32x4_private a_[4] = { simde_float32x4_to_private(val.val[0]), simde_float32x4_to_private(val.val[1]), simde_float32x4_to_private(val.val[2]), simde_float32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_f32m1x4 (ptr, dest, 4); + #else + simde_float32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -262,13 +446,22 @@ simde_vst4q_f64(simde_float64_t *ptr, simde_float64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_f64(ptr, val); #else - simde_float64_t buf[8]; simde_float64x2_private a_[4] = { simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1]), simde_float64x2_to_private(val.val[2]), simde_float64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_f64m1x4 (ptr, dest, 2); + #else + simde_float64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -282,13 +475,22 @@ simde_vst4q_s8(int8_t *ptr, simde_int8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s8(ptr, val); #else - int8_t buf[64]; simde_int8x16_private a_[4] = { simde_int8x16_to_private(val.val[0]), simde_int8x16_to_private(val.val[1]), simde_int8x16_to_private(val.val[2]), simde_int8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_i8m1x4 (ptr, dest, 16); + #else + int8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -302,13 +504,22 @@ simde_vst4q_s16(int16_t *ptr, simde_int16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s16(ptr, val); #else - int16_t buf[32]; simde_int16x8_private a_[4] = { simde_int16x8_to_private(val.val[0]), simde_int16x8_to_private(val.val[1]), simde_int16x8_to_private(val.val[2]), simde_int16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_i16m1x4 (ptr, dest, 8); + #else + int16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -322,13 +533,22 @@ simde_vst4q_s32(int32_t *ptr, simde_int32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s32(ptr, val); #else - int32_t buf[16]; simde_int32x4_private a_[4] = { simde_int32x4_to_private(val.val[0]), simde_int32x4_to_private(val.val[1]), simde_int32x4_to_private(val.val[2]), simde_int32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_i32m1x4 (ptr, dest, 4); + #else + int32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -342,13 +562,22 @@ simde_vst4q_s64(int64_t *ptr, simde_int64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_s64(ptr, val); #else - int64_t buf[8]; simde_int64x2_private a_[4] = { simde_int64x2_to_private(val.val[0]), simde_int64x2_to_private(val.val[1]), simde_int64x2_to_private(val.val[2]), simde_int64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_i64m1x4 (ptr, dest, 2); + #else + int64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -363,13 +592,22 @@ simde_vst4q_u8(uint8_t *ptr, simde_uint8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u8(ptr, val); #else - uint8_t buf[64]; simde_uint8x16_private a_[4] = { simde_uint8x16_to_private(val.val[0]), simde_uint8x16_to_private(val.val[1]), simde_uint8x16_to_private(val.val[2]), simde_uint8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 16); + #else + uint8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -383,13 +621,22 @@ simde_vst4q_u16(uint16_t *ptr, simde_uint16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u16(ptr, val); #else - uint16_t buf[32]; simde_uint16x8_private a_[4] = { simde_uint16x8_to_private(val.val[0]), simde_uint16x8_to_private(val.val[1]), simde_uint16x8_to_private(val.val[2]), simde_uint16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 8); + #else + uint16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -403,13 +650,22 @@ simde_vst4q_u32(uint32_t *ptr, simde_uint32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u32(ptr, val); #else - uint32_t buf[16]; simde_uint32x4_private a_[4] = { simde_uint32x4_to_private(val.val[0]), simde_uint32x4_to_private(val.val[1]), simde_uint32x4_to_private(val.val[2]), simde_uint32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_u32m1x4 (ptr, dest, 4); + #else + uint32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -423,18 +679,243 @@ simde_vst4q_u64(uint64_t *ptr, simde_uint64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_u64(ptr, val); #else - uint64_t buf[8]; simde_uint64x2_private a_[4] = { simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1]), simde_uint64x2_to_private(val.val[2]), simde_uint64x2_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 2); + #else + uint64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_u64 + #define vst4q_u64(a, b) simde_vst4q_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_p8(simde_poly8_t *ptr, simde_poly8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4_p8(ptr, val); + #else + simde_poly8x8_private a_[4] = { simde_poly8x8_to_private(val.val[0]), simde_poly8x8_to_private(val.val[1]), + simde_poly8x8_to_private(val.val[2]), simde_poly8x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 8); + #else + simde_poly8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_p8 + #define vst4_p8(a, b) simde_vst4_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_p16(simde_poly16_t *ptr, 
simde_poly16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4_p16(ptr, val); + #else + simde_poly16x4_private a_[4] = { simde_poly16x4_to_private(val.val[0]), simde_poly16x4_to_private(val.val[1]), + simde_poly16x4_to_private(val.val[2]), simde_poly16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 4); + #else + simde_poly16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_p16 + #define vst4_p16(a, b) simde_vst4_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_p64(simde_poly64_t *ptr, simde_poly64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst4_p64(ptr, val); + #else + simde_poly64x1_private a_[4] = { simde_poly64x1_to_private(val.val[0]), simde_poly64x1_to_private(val.val[1]), + simde_poly64x1_to_private(val.val[2]), simde_poly64x1_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 1); + #else + simde_poly64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst4_p64 + #define vst4_p64(a, b) simde_vst4_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_p8(simde_poly8_t *ptr, simde_poly8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4q_p8(ptr, val); + #else + simde_poly8x16_private a_[4] = { simde_poly8x16_to_private(val.val[0]), simde_poly8x16_to_private(val.val[1]), + simde_poly8x16_to_private(val.val[2]), simde_poly8x16_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 16); + #else + simde_poly8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_p8 + #define vst4q_p8(a, b) simde_vst4q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_p16(simde_poly16_t *ptr, simde_poly16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4q_p16(ptr, val); + #else + simde_poly16x8_private a_[4] = { simde_poly16x8_to_private(val.val[0]), 
simde_poly16x8_to_private(val.val[1]), + simde_poly16x8_to_private(val.val[2]), simde_poly16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 8); + #else + simde_poly16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_p16 + #define vst4q_p16(a, b) simde_vst4q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_p64(simde_poly64_t *ptr, simde_poly64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst4q_p64(ptr, val); + #else + simde_poly64x2_private a_[4] = { simde_poly64x2_to_private(val.val[0]), simde_poly64x2_to_private(val.val[1]), + simde_poly64x2_to_private(val.val[2]), simde_poly64x2_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVLSSEG) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 2); + #else + simde_poly64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_p64 + #define vst4q_p64(a, b) simde_vst4q_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst4_bf16(ptr, val); + #else + simde_bfloat16x4_private a_[4] = { simde_bfloat16x4_to_private(val.val[0]), simde_bfloat16x4_to_private(val.val[1]), + simde_bfloat16x4_to_private(val.val[2]), simde_bfloat16x4_to_private(val.val[3]) }; + simde_bfloat16_t buf[16]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { buf[i] = a_[i % 4].values[i / 4]; } simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - #undef vst4q_u64 - #define vst4q_u64(a, b) simde_vst4q_u64((a), (b)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst4_bf16 + #define vst4_bf16(a, b) simde_vst4_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst4q_bf16(ptr, val); + #else + simde_bfloat16x8_private a_[4] = { simde_bfloat16x8_to_private(val.val[0]), simde_bfloat16x8_to_private(val.val[1]), + simde_bfloat16x8_to_private(val.val[2]), simde_bfloat16x8_to_private(val.val[3]) }; + simde_bfloat16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, 
sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst4q_bf16 + #define vst4q_bf16(a, b) simde_vst4q_bf16((a), (b)) #endif #endif /* !defined(SIMDE_BUG_INTEL_857088) */ diff --git a/arm/neon/st4_lane.h b/arm/neon/st4_lane.h index e5101e46d..b85078379 100644 --- a/arm/neon/st4_lane.h +++ b/arm/neon/st4_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST4_LANE_H) @@ -190,6 +191,25 @@ simde_vst4_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_t val, #define vst4_lane_u64(a, b, c) simde_vst4_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_float16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vst4_lane_f16(a, b, c) vst4_lane_f16((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst4_lane_f16 + #define vst4_lane_f16(a, b, c) simde_vst4_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x4_t val, const int lane) @@ -381,6 +401,26 @@ simde_vst4q_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x4_t val #define vst4q_lane_u64(a, b, c) simde_vst4q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_float16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vst4q_lane_f16 + #define vst4q_lane_f16(a, b, c) simde_vst4q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4x4_t val, const int lane) @@ -420,6 +460,161 @@ simde_vst4q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x2 #define vst4q_lane_f64(a, b, c) simde_vst4q_lane_f64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly8x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_p8 + #define vst4_lane_p8(a, b, c) simde_vst4_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
+simde_vst4_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst4_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_p16 + #define vst4_lane_p16(a, b, c) simde_vst4_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + (void) lane; + vst4_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly64x1_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_p64 + #define vst4_lane_p64(a, b, c) simde_vst4_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst4q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly8x16_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_p8 + #define vst4q_lane_p8(a, b, c) simde_vst4q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_p16 + #define vst4q_lane_p16(a, b, c) simde_vst4q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst4q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly64x2_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_p64 + #define vst4q_lane_p64(a, b, c) simde_vst4q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst4_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_bfloat16x4_to_private(val.val[i]); + 
ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst4_lane_bf16 + #define vst4_lane_bf16(a, b, c) simde_vst4_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_bfloat16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) + #undef vst4q_lane_bf16 + #define vst4q_lane_bf16(a, b, c) simde_vst4q_lane_bf16((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/sub.h b/arm/neon/sub.h index 85a9d5017..2f7022c09 100644 --- a/arm/neon/sub.h +++ b/arm/neon/sub.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SUB_H) @@ -33,6 +34,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_vsubh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsubh_f16(a, b); + #else + simde_float32 af = simde_float16_to_float32(a); + simde_float32 bf = simde_float16_to_float32(b); + return simde_float16_from_float32(af - bf); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vsubh_f16 + #define vsubh_f16(a, b) simde_vsubh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES int64_t simde_vsubd_s64(int64_t a, int64_t b) { @@ -61,6 +79,31 @@ simde_vsubd_u64(uint64_t a, uint64_t b) { #define vsubd_u64(a, b) simde_vsubd_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vsub_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsub_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsubh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vsub_f16 + #define vsub_f16(a, b) simde_vsub_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vsub_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -353,6 +396,33 @@ simde_vsub_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #define vsub_u64(a, b) simde_vsub_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vsubq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsubq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = 
simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]); + simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]); + r_.values[i] = simde_float16_from_float32(tmp_a_ - tmp_b_); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vsubq_f16 + #define vsubq_f16(a, b) simde_vsubq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vsubq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/arm/neon/subhn_high.h b/arm/neon/subhn_high.h new file mode 100644 index 000000000..4a14749a1 --- /dev/null +++ b/arm/neon/subhn_high.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SUBHN_HIGH_H) +#define SIMDE_ARM_NEON_SUBHN_HIGH_H + +#include "subhn.h" +#include "combine.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_s16(r, a, b) vsubhn_high_s16((r), (a), (b)) +#else + #define simde_vsubhn_high_s16(r, a, b) simde_vcombine_s8(r, simde_vsubhn_s16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_s16 + #define vsubhn_high_s16(r, a, b) simde_vsubhn_high_s16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_s32(r, a, b) vsubhn_high_s32((r), (a), (b)) +#else + #define simde_vsubhn_high_s32(r, a, b) simde_vcombine_s16(r, simde_vsubhn_s32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_s32 + #define vsubhn_high_s32(r, a, b) simde_vsubhn_high_s32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_s64(r, a, b) vsubhn_high_s64((r), (a), (b)) +#else + #define simde_vsubhn_high_s64(r, a, b) simde_vcombine_s32(r, simde_vsubhn_s64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_s64 + #define vsubhn_high_s64(r, a, b) simde_vsubhn_high_s64((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_u16(r, a, b) vsubhn_high_u16((r), (a), (b)) +#else + #define simde_vsubhn_high_u16(r, a, b) simde_vcombine_u8(r, simde_vsubhn_u16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_u16 + #define vsubhn_high_u16(r, a, b) simde_vsubhn_high_u16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_u32(r, a, b) vsubhn_high_u32((r), (a), (b)) +#else + #define simde_vsubhn_high_u32(r, a, b) simde_vcombine_u16(r, simde_vsubhn_u32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_u32 + #define vsubhn_high_u32(r, a, b) simde_vsubhn_high_u32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_u64(r, a, b) vsubhn_high_u64((r), (a), (b)) +#else + #define simde_vsubhn_high_u64(r, a, b) simde_vcombine_u32(r, simde_vsubhn_u64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_u64 + #define vsubhn_high_u64(r, a, b) simde_vsubhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SUBHN_HIGH_H) */ diff --git a/arm/neon/subl.h b/arm/neon/subl.h index 356bf5610..3ac143f7d 100644 --- a/arm/neon/subl.h +++ b/arm/neon/subl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBL_H) @@ -42,6 +43,12 @@ simde_int16x8_t simde_vsubl_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + r_.sv128 = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv64) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8); + return simde_int16x8_from_private(r_); #else return 
simde_vsubq_s16(simde_vmovl_s8(a), simde_vmovl_s8(b)); #endif @@ -56,6 +63,12 @@ simde_int32x4_t simde_vsubl_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + r_.sv128 = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(simde_vmovl_s16(a), simde_vmovl_s16(b)); #endif @@ -70,6 +83,12 @@ simde_int64x2_t simde_vsubl_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + r_.sv128 = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(simde_vmovl_s32(a), simde_vmovl_s32(b)); #endif @@ -84,6 +103,12 @@ simde_uint16x8_t simde_vsubl_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + r_.sv128 = __riscv_vwsubu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(simde_vmovl_u8(a), simde_vmovl_u8(b)); #endif @@ -98,6 +123,12 @@ simde_uint32x4_t simde_vsubl_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + r_.sv128 = __riscv_vwsubu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(simde_vmovl_u16(a), simde_vmovl_u16(b)); #endif @@ -112,6 +143,12 @@ simde_uint64x2_t simde_vsubl_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + r_.sv128 = __riscv_vwsubu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64) , 4); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(simde_vmovl_u32(a), simde_vmovl_u32(b)); #endif diff --git a/arm/neon/subl_high.h b/arm/neon/subl_high.h index d45f4989b..860cb6e4d 100644 --- a/arm/neon/subl_high.h +++ b/arm/neon/subl_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Décio Luiz Gazzoni Filho + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBL_HIGH_H) @@ -41,6 +42,14 @@ simde_int16x8_t simde_vsubl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && 
(SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b)); #endif @@ -55,6 +64,14 @@ simde_int32x4_t simde_vsubl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b)); #endif @@ -69,6 +86,14 @@ simde_int64x2_t simde_vsubl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b)); #endif @@ -83,6 +108,14 @@ simde_uint16x8_t simde_vsubl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsubu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b)); #endif @@ -97,6 +130,14 @@ simde_uint32x4_t simde_vsubl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsubu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4); + return 
simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b)); #endif @@ -111,6 +152,14 @@ simde_uint64x2_t simde_vsubl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwsubu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b)); #endif diff --git a/arm/neon/subw.h b/arm/neon/subw.h index 51d6cf4bf..2f44a3529 100644 --- a/arm/neon/subw.h +++ b/arm/neon/subw.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBW_H) @@ -40,14 +41,16 @@ simde_int16x8_t simde_vsubw_s8(simde_int16x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s16(a, simde_vmovl_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsub_wv_i16m1(a_.sv128 , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -70,14 +73,16 @@ simde_int32x4_t simde_vsubw_s16(simde_int32x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s32(a, simde_vmovl_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsub_wv_i32m1(a_.sv128 , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -100,14 +105,16 @@ simde_int64x2_t simde_vsubw_s32(simde_int64x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s64(a, simde_vmovl_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsub_wv_i64m1(a_.sv128 , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -130,14 +137,16 @@ simde_uint16x8_t simde_vsubw_u8(simde_uint16x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u16(a, simde_vmovl_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x8_private b_ = simde_uint8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsubu_wv_u16m1(a_.sv128 , __riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv64) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -160,14 +169,16 @@ simde_uint32x4_t simde_vsubw_u16(simde_uint32x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u32(a, simde_vmovl_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsubu_wv_u32m1(a_.sv128 , __riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv64) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -190,14 +201,16 @@ simde_uint64x2_t simde_vsubw_u32(simde_uint64x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u64(a, simde_vmovl_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsubu_wv_u64m1(a_.sv128 , __riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv64) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else diff --git a/arm/neon/subw_high.h b/arm/neon/subw_high.h index 729a478a7..f48c6ed67 100644 --- a/arm/neon/subw_high.h +++ b/arm/neon/subw_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBW_HIGH_H) @@ -40,14 +41,17 @@ simde_int16x8_t simde_vsubw_high_s8(simde_int16x8_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif 
SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s16(a, simde_vmovl_high_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsub_wv_i16m1(a_.sv128 , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -70,14 +74,17 @@ simde_int32x4_t simde_vsubw_high_s16(simde_int32x4_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s32(a, simde_vmovl_high_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x8_private b_ = simde_int16x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsub_wv_i32m1(a_.sv128 , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -100,14 +107,16 @@ simde_int64x2_t simde_vsubw_high_s32(simde_int64x2_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s64(a, simde_vmovl_high_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x4_private b_ = simde_int32x4_to_private(b); - - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4); + r_.sv128 = __riscv_vwsub_wv_i64m1(a_.sv128 , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -130,14 +139,17 @@ simde_uint16x8_t simde_vsubw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u16(a, simde_vmovl_high_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x16_private b_ = simde_uint8x16_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsubu_wv_u16m1(a_.sv128 , __riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv128) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -160,14 +172,17 @@ simde_uint32x4_t simde_vsubw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u32(a, simde_vmovl_high_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsubu_wv_u32m1(a_.sv128 , __riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv128) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -190,14 +205,17 @@ simde_uint64x2_t simde_vsubw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u64(a, simde_vmovl_high_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4); + r_.sv128 = __riscv_vwsubu_wv_u64m1(a_.sv128 , __riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv128) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else diff --git a/arm/neon/sudot_lane.h b/arm/neon/sudot_lane.h new file mode 100644 index 000000000..5137780ce --- /dev/null +++ b/arm/neon/sudot_lane.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SUDOT_LANE_H) +#define SIMDE_ARM_NEON_SUDOT_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vsudot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_uint8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudot_lane_s32(r, a, b, lane) vsudot_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vsudot_lane_s32 + #define vsudot_lane_s32(r, a, b, lane) simde_vsudot_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vsudot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_uint8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudot_laneq_s32(r, a, b, lane) vsudot_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vsudot_laneq_s32 + #define vsudot_laneq_s32(r, a, b, lane) simde_vsudot_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vsudotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_uint8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudotq_laneq_s32(r, a, b, lane) vsudotq_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vsudotq_laneq_s32 + #define vsudotq_laneq_s32(r, a, b, lane) simde_vsudotq_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vsudotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_uint8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudotq_lane_s32(r, a, b, lane) vsudotq_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vsudotq_lane_s32 + #define vsudotq_lane_s32(r, a, b, lane) simde_vsudotq_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SUDOT_LANE_H) */ diff --git a/arm/neon/tbl.h b/arm/neon/tbl.h index 224e86d7c..36b7d3c5e 100644 --- a/arm/neon/tbl.h +++ b/arm/neon/tbl.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TBL_H) @@ -59,6 +61,10 @@ simde_vtbl1_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_or_si64(b_.m64, _mm_cmpgt_pi8(b_.m64, _mm_set1_pi8(7)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (b_.sv64, 8, 8); + r_.sv64 = __riscv_vrgather_vv_u8m1(a_.sv64 , b_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, 0, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -106,6 +112,11 @@ simde_vtbl2_u8(simde_uint8x8x2_t a, simde_uint8x8_t b) { __m128i b128 = _mm_set1_epi64(b_.m64); __m128i r128 = _mm_shuffle_epi8(a128, _mm_or_si128(b128, _mm_cmpgt_epi8(b128, _mm_set1_epi8(15)))); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t t_combine = __riscv_vslideup_vx_u8m1(a_[0].sv64 , a_[1].sv64 , 8 , 16); + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (b_.sv64 , 16 , 8); + vuint8m1_t r_tmp = __riscv_vrgather_vv_u8m1(t_combine , b_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_tmp, 0, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -155,6 +166,16 @@ simde_vtbl3_u8(simde_uint8x8x3_t a, simde_uint8x8_t b) { __m128i r128_2 = _mm_shuffle_epi8(_mm_set1_epi64(a_[2].m64), b128); __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(b128, 3)); r_.m64 = _mm_movepi64_pi64(r128); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[2].sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t2 , t3 , 8 , 24); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 24); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(b_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 24, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -204,6 +225,18 @@ simde_vtbl4_u8(simde_uint8x8x4_t a, simde_uint8x8_t b) { __m128i r128_23 = _mm_shuffle_epi8(_mm_set_epi64(a_[3].m64, a_[2].m64), b128); __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(b128, 3)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[2].sv64); + vuint8m2_t t4 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[3].sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t3 , t4 , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t2 , t_combine , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(b_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -235,6 +268,68 @@ simde_vtbl4_s8(simde_int8x8x4_t a, simde_int8x8_t b) { #define vtbl4_s8(a, b) simde_vtbl4_s8((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl1_p8(simde_poly8x8_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl1_p8(a, b); + #else + return simde_vreinterpret_p8_u8(simde_vtbl1_u8(simde_vreinterpret_u8_p8(a), b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl1_p8 + #define vtbl1_p8(a, b) simde_vtbl1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl2_p8(simde_poly8x8x2_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl2_p8(a, b); + #else + simde_uint8x8x2_t a_; + simde_memcpy(&a_, &a, sizeof(a_)); + return simde_vreinterpret_p8_u8(simde_vtbl2_u8(a_, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl2_p8 + #define vtbl2_p8(a, b) simde_vtbl2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl3_p8(simde_poly8x8x3_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl3_p8(a, b); + #else + simde_uint8x8x3_t a_; + simde_memcpy(&a_, &a, sizeof(a_)); + return simde_vreinterpret_p8_u8(simde_vtbl3_u8(a_, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl3_p8 + #define vtbl3_p8(a, b) simde_vtbl3_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl4_p8(simde_poly8x8x4_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl4_p8(a, b); + #else + simde_uint8x8x4_t a_; + simde_memcpy(&a_, &a, sizeof(a_)); + return 
simde_vreinterpret_p8_u8(simde_vtbl4_u8(a_, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl4_p8 + #define vtbl4_p8(a, b) simde_vtbl4_p8((a), (b)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/tbx.h b/arm/neon/tbx.h index 4e2c639f0..fdd450d4c 100644 --- a/arm/neon/tbx.h +++ b/arm/neon/tbx.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TBX_H) @@ -55,6 +57,10 @@ simde_vtbx1_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { __m128i r128 = _mm_shuffle_epi8(b128, c128); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (c_.sv64, 8, 16); + r_.sv64 = __riscv_vrgather_vv_u8m1(b_.sv64 , c_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vvm_u8m1(r_.sv64, a_.sv64, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -106,6 +112,11 @@ simde_vtbx2_u8(simde_uint8x8_t a, simde_uint8x8x2_t b, simde_uint8x8_t c) { __m128i r128 = _mm_shuffle_epi8(b128, c128); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t t_combine = __riscv_vslideup_vx_u8m1(b_[0].sv64 , b_[1].sv64 , 8 , 16); + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (c_.sv64 , 16 , 8); + vuint8m1_t r_tmp = __riscv_vrgather_vv_u8m1(t_combine , c_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vvm_u8m1(r_tmp, a_.sv64, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -160,6 +171,17 @@ simde_vtbx3_u8(simde_uint8x8_t a, simde_uint8x8x3_t b, simde_uint8x8_t c) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(c128, 3)); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[2].sv64); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_.sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t2 , t3 , 8 , 24); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 24); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(c_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 24, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -214,6 +236,19 @@ simde_vtbx4_u8(simde_uint8x8_t a, simde_uint8x8x4_t b, simde_uint8x8_t c) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(c128, 3)); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[2].sv64); + vuint8m2_t t4 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[3].sv64); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_.sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t3 , t4 , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t2 , t_combine , 8 , 32); 
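+ /* the final slide-up places b_[0] in bytes 0..7, completing the 32-byte lookup table b[0]|b[1]|b[2]|b[3]; indices >= 32 are redirected to the fallback vector a by the merge below */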
+ t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(c_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -247,6 +282,74 @@ simde_vtbx4_s8(simde_int8x8_t a, simde_int8x8x4_t b, simde_int8x8_t c) { #define vtbx4_s8(a, b, c) simde_vtbx4_s8((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx1_p8(simde_poly8x8_t a, simde_poly8x8_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx1_p8(a, b, c); + #else + return simde_vreinterpret_p8_u8(simde_vtbx1_u8(simde_vreinterpret_u8_p8(a), simde_vreinterpret_u8_p8(b), c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx1_p8 + #define vtbx1_p8(a, b, c) simde_vtbx1_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx2_p8(simde_poly8x8_t a, simde_poly8x8x2_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx2_p8(a, b, c); + #else + simde_uint8x8x2_t b_; + simde_memcpy(&b_, &b, sizeof(b_)); + return simde_vreinterpret_p8_u8(simde_vtbx2_u8(simde_vreinterpret_u8_p8(a), + b_, + c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx2_p8 + #define vtbx2_p8(a, b, c) simde_vtbx2_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx3_p8(simde_poly8x8_t a, simde_poly8x8x3_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx3_p8(a, b, c); + #else + simde_uint8x8x3_t b_; + simde_memcpy(&b_, &b, sizeof(b_)); + return simde_vreinterpret_p8_u8(simde_vtbx3_u8(simde_vreinterpret_u8_p8(a), + b_, + c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx3_p8 + #define vtbx3_p8(a, b, c) simde_vtbx3_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx4_p8(simde_poly8x8_t a, simde_poly8x8x4_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx4_p8(a, b, c); + #else + simde_uint8x8x4_t b_; + simde_memcpy(&b_, &b, sizeof(b_)); + return simde_vreinterpret_p8_u8(simde_vtbx4_u8(simde_vreinterpret_u8_p8(a), + b_, + c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx4_p8 + #define vtbx4_p8(a, b, c) simde_vtbx4_p8((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/arm/neon/trn.h b/arm/neon/trn.h index 9f9184849..3d5149eaa 100644 --- a/arm/neon/trn.h +++ b/arm/neon/trn.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TRN_H) && !defined(SIMDE_BUG_INTEL_857088) @@ -36,6 +37,22 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vtrn_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn_f16(a, b); + #else + simde_float16x4x2_t r = { { simde_vtrn1_f16(a, b), simde_vtrn2_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!defined(SIMDE_ARM_NEON_FP16)) + #undef vtrn_f16 + #define vtrn_f16(a, b) simde_vtrn_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vtrn_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -141,6 +158,22 @@ simde_vtrn_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vtrn_u32(a, b) simde_vtrn_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vtrnq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrnq_f16(a, b); + #else + simde_float16x8x2_t r = { { simde_vtrn1q_f16(a, b), simde_vtrn2q_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vtrnq_f16 + #define vtrnq_f16(a, b) simde_vtrnq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vtrnq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -246,6 +279,66 @@ simde_vtrnq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vtrnq_u32(a, b) simde_vtrnq_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vtrn_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrn_p8(a, b); + #else + simde_poly8x8x2_t r = { { simde_vtrn1_p8(a, b), simde_vtrn2_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrn_p8 + #define vtrn_p8(a, b) simde_vtrn_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vtrn_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrn_p16(a, b); + #else + simde_poly16x4x2_t r = { { simde_vtrn1_p16(a, b), simde_vtrn2_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrn_p16 + #define vtrn_p16(a, b) simde_vtrn_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vtrnq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrnq_p8(a, b); + #else + simde_poly8x16x2_t r = { { simde_vtrn1q_p8(a, b), simde_vtrn2q_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrnq_p8 + #define vtrnq_p8(a, b) simde_vtrnq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vtrnq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrnq_p16(a, b); + #else + simde_poly16x8x2_t r = { { simde_vtrn1q_p16(a, b), simde_vtrn2q_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrnq_p16 + #define vtrnq_p16(a, b) simde_vtrnq_p16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/trn1.h b/arm/neon/trn1.h index f3b1521aa..ba01c1ef5 100644 --- a/arm/neon/trn1.h +++ b/arm/neon/trn1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TRN1_H) @@ -34,6 +35,34 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vtrn1_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn1_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ 
= simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vtrn1_f16 + #define vtrn1_f16(a, b) simde_vtrn1_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vtrn1_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -223,6 +252,34 @@ simde_vtrn1_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vtrn1_u32(a, b) simde_vtrn1_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vtrn1q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn1q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vtrn1q_f16 + #define vtrn1q_f16(a, b) simde_vtrn1q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vtrn1q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -494,6 +551,141 @@ simde_vtrn1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vtrn1q_u64(a, b) simde_vtrn1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtrn1_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1_p8 + #define vtrn1_p8(a, b) simde_vtrn1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vtrn1_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1_p16 + #define vtrn1_p16(a, b) simde_vtrn1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vtrn1q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1q_p8(a, b); + #else + 
simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_p8 + #define vtrn1q_p8(a, b) simde_vtrn1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vtrn1q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1q_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_p16 + #define vtrn1q_p16(a, b) simde_vtrn1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vtrn1q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_p64 + #define vtrn1q_p64(a, b) simde_vtrn1q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/trn2.h b/arm/neon/trn2.h index 31bd7dc4e..ad6f1fba1 100644 --- a/arm/neon/trn2.h +++ b/arm/neon/trn2.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TRN2_H) @@ -34,6 +35,34 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vtrn2_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn2_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vtrn2_f16 + #define vtrn2_f16(a, b) simde_vtrn2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vtrn2_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -223,6 +252,34 @@ simde_vtrn2_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vtrn2_u32(a, b) 
simde_vtrn2_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vtrn2q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn2q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vtrn2q_f16 + #define vtrn2q_f16(a, b) simde_vtrn2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vtrn2q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -493,6 +550,141 @@ simde_vtrn2q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vtrn2q_u64(a, b) simde_vtrn2q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtrn2_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2_p8 + #define vtrn2_p8(a, b) simde_vtrn2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vtrn2_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2_p16 + #define vtrn2_p16(a, b) simde_vtrn2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vtrn2q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2q_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_p8 + #define vtrn2q_p8(a, b) simde_vtrn2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vtrn2q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2q_p16(a, b); + #else + 
simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_p16 + #define vtrn2q_p16(a, b) simde_vtrn2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vtrn2q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_p64 + #define vtrn2q_p64(a, b) simde_vtrn2q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/tst.h b/arm/neon/tst.h index 243444622..fdc146d27 100644 --- a/arm/neon/tst.h +++ b/arm/neon/tst.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TST_H) @@ -562,6 +563,102 @@ simde_vtst_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #define vtst_u64(a, b) simde_vtst_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vtst_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtst_p8(a, b); + #else + simde_poly8x8_private + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + simde_uint8x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? UINT8_MAX : 0; + } + + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtst_p8 + #define vtst_p8(a, b) simde_vtst_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vtst_p64(simde_poly64x1_t a, simde_poly64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtst_p64(a, b); + #else + simde_poly64x1_private + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? 
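+ /* vtst semantics: a lane becomes all-ones when a and b share any set bit, otherwise zero */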
UINT64_MAX : 0; + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtst_p64 + #define vtst_p64(a, b) simde_vtst_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vtstq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtstq_p8(a, b); + #else + simde_poly8x16_private + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + simde_uint8x16_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? UINT8_MAX : 0; + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtstq_p8 + #define vtstq_p8(a, b) simde_vtstq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vtstq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtstq_p64(a, b); + #else + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? UINT64_MAX : 0; + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtstq_p64 + #define vtstq_p64(a, b) simde_vtstq_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/types.h b/arm/neon/types.h index 12bce8b87..e3da39be5 100644 --- a/arm/neon/types.h +++ b/arm/neon/types.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TYPES_H) @@ -29,6 +31,7 @@ #include "../../simde-common.h" #include "../../simde-f16.h" +#include "../../simde-bf16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -46,6 +49,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint8m1_t sv64; + #endif + } simde_int8x8_private; typedef union { @@ -54,6 +62,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint16m1_t sv64; + #endif + } simde_int16x4_private; typedef union { @@ -62,6 +75,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint32m1_t sv64; + #endif + } simde_int32x2_private; typedef union { @@ -70,6 +88,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint64m1_t sv64; + #endif + } simde_int64x1_private; typedef union { @@ -78,6 +101,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv64; + #endif + } simde_uint8x8_private; typedef union { @@ -86,6 +114,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv64; + #endif + } simde_uint16x4_private; typedef union { @@ -94,6 +127,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint32m1_t sv64; + #endif + } simde_uint32x2_private; typedef union { @@ -102,6 +140,11 @@ typedef union 
{ #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv64; + #endif + } simde_uint64x1_private; typedef union { @@ -114,6 +157,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + fixed_vfloat16m1_t sv64; + #endif + } simde_float16x4_private; typedef union { @@ -122,6 +170,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat32m1_t sv64; + #endif + } simde_float32x2_private; typedef union { @@ -130,8 +183,34 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat64m1_t sv64; + #endif + } simde_float64x1_private; +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly8, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv64; + #endif +} simde_poly8x8_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly16, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv64; + #endif +} simde_poly16x4_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly64, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv64; + #endif +} simde_poly64x1_private; + typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(int8_t, values, 16); @@ -146,6 +225,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint8m1_t sv128; + #endif + } simde_int8x16_private; typedef union { @@ -162,6 +246,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint16m1_t sv128; + #endif + } simde_int16x8_private; typedef union { @@ -182,6 +271,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint32m1_t sv128; + #endif + } simde_int32x4_private; typedef union { @@ -198,6 +292,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint64m1_t sv128; + #endif + } simde_int64x2_private; typedef union { @@ -214,6 +313,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv128; + #endif + } simde_uint8x16_private; typedef union { @@ -230,6 +334,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv128; + #endif + } simde_uint16x8_private; typedef union { @@ -246,6 +355,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint32m1_t sv128; + #endif + } simde_uint32x4_private; typedef union { @@ -262,6 +376,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv128; + #endif + } simde_uint64x2_private; typedef union { @@ -271,7 +390,7 @@ typedef union { simde_float16 values[8]; #endif - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) __m128 m128; #endif @@ -282,12 +401,17 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + fixed_vfloat16m1_t sv128; + #endif + } simde_float16x8_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_float32, values, 
16); - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) __m128 m128; #endif @@ -298,12 +422,17 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat32m1_t sv128; + #endif + } simde_float32x4_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_float64, values, 16); - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) __m128d m128d; #endif @@ -314,10 +443,54 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat64m1_t sv128; + #endif + } simde_float64x2_private; +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly8, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv128; + #endif +} simde_poly8x16_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly16, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv128; + #endif +} simde_poly16x8_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly64, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv128; + #endif +} simde_poly64x2_private; + +typedef union { + #if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_bfloat16, values, 8); + #else + simde_bfloat16 values[4]; + #endif +} simde_bfloat16x4_private; + +typedef union { + #if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_bfloat16, values, 16); + #else + simde_bfloat16 values[8]; + #endif +} simde_bfloat16x8_private; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) typedef float32_t simde_float32_t; + typedef poly8_t simde_poly8_t; + typedef poly16_t simde_poly16_t; typedef int8x8_t simde_int8x8_t; typedef int16x4_t simde_int16x4_t; @@ -328,6 +501,8 @@ typedef union { typedef uint32x2_t simde_uint32x2_t; typedef uint64x1_t simde_uint64x1_t; typedef float32x2_t simde_float32x2_t; + typedef poly8x8_t simde_poly8x8_t; + typedef poly16x4_t simde_poly16x4_t; typedef int8x16_t simde_int8x16_t; typedef int16x8_t simde_int16x8_t; @@ -338,6 +513,8 @@ typedef union { typedef uint32x4_t simde_uint32x4_t; typedef uint64x2_t simde_uint64x2_t; typedef float32x4_t simde_float32x4_t; + typedef poly8x16_t simde_poly8x16_t; + typedef poly16x8_t simde_poly16x8_t; typedef int8x8x2_t simde_int8x8x2_t; typedef int16x4x2_t simde_int16x4x2_t; @@ -348,6 +525,8 @@ typedef union { typedef uint32x2x2_t simde_uint32x2x2_t; typedef uint64x1x2_t simde_uint64x1x2_t; typedef float32x2x2_t simde_float32x2x2_t; + typedef poly8x8x2_t simde_poly8x8x2_t; + typedef poly16x4x2_t simde_poly16x4x2_t; typedef int8x16x2_t simde_int8x16x2_t; typedef int16x8x2_t simde_int16x8x2_t; @@ -358,6 +537,8 @@ typedef union { typedef uint32x4x2_t simde_uint32x4x2_t; typedef uint64x2x2_t simde_uint64x2x2_t; typedef float32x4x2_t simde_float32x4x2_t; + typedef poly8x16x2_t simde_poly8x16x2_t; + typedef poly16x8x2_t simde_poly16x8x2_t; typedef int8x8x3_t simde_int8x8x3_t; typedef int16x4x3_t simde_int16x4x3_t; @@ -368,6 +549,8 @@ typedef union { typedef uint32x2x3_t simde_uint32x2x3_t; typedef uint64x1x3_t simde_uint64x1x3_t; typedef float32x2x3_t simde_float32x2x3_t; + typedef poly8x8x3_t simde_poly8x8x3_t; + typedef poly16x4x3_t simde_poly16x4x3_t; typedef int8x16x3_t simde_int8x16x3_t; typedef int16x8x3_t simde_int16x8x3_t; @@ -378,6 +561,8 @@ typedef union { typedef uint32x4x3_t simde_uint32x4x3_t; typedef uint64x2x3_t simde_uint64x2x3_t; typedef 
float32x4x3_t simde_float32x4x3_t; + typedef poly8x16x3_t simde_poly8x16x3_t; + typedef poly16x8x3_t simde_poly16x8x3_t; typedef int8x8x4_t simde_int8x8x4_t; typedef int16x4x4_t simde_int16x4x4_t; @@ -388,6 +573,8 @@ typedef union { typedef uint32x2x4_t simde_uint32x2x4_t; typedef uint64x1x4_t simde_uint64x1x4_t; typedef float32x2x4_t simde_float32x2x4_t; + typedef poly8x8x4_t simde_poly8x8x4_t; + typedef poly16x4x4_t simde_poly16x4x4_t; typedef int8x16x4_t simde_int8x16x4_t; typedef int16x8x4_t simde_int16x8x4_t; @@ -398,6 +585,55 @@ typedef union { typedef uint32x4x4_t simde_uint32x4x4_t; typedef uint64x2x4_t simde_uint64x2x4_t; typedef float32x4x4_t simde_float32x4x4_t; + typedef poly8x16x4_t simde_poly8x16x4_t; + typedef poly16x8x4_t simde_poly16x8x4_t; + + #if defined(SIMDE_ARM_NEON_FP16) + typedef float16_t simde_float16_t; + typedef float16x4_t simde_float16x4_t; + typedef float16x4x2_t simde_float16x4x2_t; + typedef float16x4x3_t simde_float16x4x3_t; + typedef float16x4x4_t simde_float16x4x4_t; + typedef float16x8_t simde_float16x8_t; + typedef float16x8x2_t simde_float16x8x2_t; + typedef float16x8x3_t simde_float16x8x3_t; + typedef float16x8x4_t simde_float16x8x4_t; + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 + #endif + + #if defined(SIMDE_ARM_NEON_BF16) + typedef bfloat16_t simde_bfloat16_t; + typedef bfloat16x4_t simde_bfloat16x4_t; + typedef bfloat16x4x2_t simde_bfloat16x4x2_t; + typedef bfloat16x4x3_t simde_bfloat16x4x3_t; + typedef bfloat16x4x4_t simde_bfloat16x4x4_t; + typedef bfloat16x8_t simde_bfloat16x8_t; + typedef bfloat16x8x2_t simde_bfloat16x8x2_t; + typedef bfloat16x8x3_t simde_bfloat16x8x3_t; + typedef bfloat16x8x4_t simde_bfloat16x8x4_t; + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 + #endif + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + typedef poly64_t simde_poly64_t; + typedef poly64x1_t simde_poly64x1_t; + typedef poly64x2_t simde_poly64x2_t; + typedef poly64x1x2_t simde_poly64x1x2_t; + typedef poly64x2x2_t simde_poly64x2x2_t; + typedef poly64x1x3_t simde_poly64x1x3_t; + typedef poly64x2x3_t simde_poly64x2x3_t; + typedef poly64x1x4_t simde_poly64x1x4_t; + typedef poly64x2x4_t simde_poly64x2x4_t; + #if defined(SIMDE_ARCH_ARM_CRYPTO) + typedef poly128_t simde_poly128_t; + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #endif + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #endif #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) typedef float64_t simde_float64_t; @@ -417,16 +653,15 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN #endif - #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 - typedef float16_t simde_float16_t; - typedef float16x4_t simde_float16x4_t; - typedef float16x8_t simde_float16x8_t; - #else - #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 - #endif #elif (defined(SIMDE_X86_MMX_NATIVE) || defined(SIMDE_X86_SSE_NATIVE)) && defined(SIMDE_ARM_NEON_FORCE_NATIVE_TYPES) + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN @@ -462,7 +697,7 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_F32X4 #endif - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) typedef 
__m128i simde_int8x16_t; typedef __m128i simde_int16x8_t; typedef __m128i simde_int32x4_t; @@ -483,11 +718,14 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_U64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2 #endif - - #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_ARM_NEON_FORCE_NATIVE_TYPES) #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_64BIT @@ -507,8 +745,14 @@ typedef union { typedef v128_t simde_float32x4_t; typedef v128_t simde_float64x2_t; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_64BIT #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN @@ -531,9 +775,43 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_I64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_U64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2 - #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #endif +#elif defined(SIMDE_RISCV_V_NATIVE) + + typedef fixed_vint8m1_t simde_int8x8_t; + typedef fixed_vint16m1_t simde_int16x4_t; + typedef fixed_vint32m1_t simde_int32x2_t; + typedef fixed_vint64m1_t simde_int64x1_t; + typedef fixed_vuint8m1_t simde_uint8x8_t; + typedef fixed_vuint16m1_t simde_uint16x4_t; + typedef fixed_vuint32m1_t simde_uint32x2_t; + typedef fixed_vuint64m1_t simde_uint64x1_t; + typedef fixed_vfloat32m1_t simde_float32x2_t; + typedef fixed_vfloat64m1_t simde_float64x1_t; + + typedef fixed_vint8m1_t simde_int8x16_t; + typedef fixed_vint16m1_t simde_int16x8_t; + typedef fixed_vint32m1_t simde_int32x4_t; + typedef fixed_vint64m1_t simde_int64x2_t; + typedef fixed_vuint8m1_t simde_uint8x16_t; + typedef fixed_vuint16m1_t simde_uint16x8_t; + typedef fixed_vuint32m1_t simde_uint32x4_t; + typedef fixed_vuint64m1_t simde_uint64x2_t; + typedef fixed_vfloat32m1_t simde_float32x4_t; + typedef fixed_vfloat64m1_t simde_float64x2_t; + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 + #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 + #elif defined(SIMDE_VECTOR) typedef simde_float32 simde_float32_t; typedef simde_float64 simde_float64_t; @@ -562,14 +840,42 @@ typedef union { typedef simde_float16 simde_float16_t; typedef simde_float16_t simde_float16x4_t SIMDE_VECTOR(8); typedef simde_float16_t simde_float16x8_t SIMDE_VECTOR(16); + typedef struct simde_float16x4x2_t { + simde_float16x4_t val[2]; + } simde_float16x4x2_t; + typedef struct simde_float16x4x3_t { + simde_float16x4_t val[3]; + } simde_float16x4x3_t; + typedef struct simde_float16x4x4_t 
{ + simde_float16x4_t val[4]; + } simde_float16x4x4_t; + typedef struct simde_float16x8x2_t { + simde_float16x8_t val[2]; + } simde_float16x8x2_t; + typedef struct simde_float16x8x3_t { + simde_float16x8_t val[3]; + } simde_float16x8x3_t; + typedef struct simde_float16x8x4_t { + simde_float16x8_t val[4]; + } simde_float16x8x4_t; #else #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #endif + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 @@ -581,6 +887,114 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN #endif +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY) + typedef simde_poly8 simde_poly8_t; + typedef simde_poly16 simde_poly16_t; + + typedef simde_poly8x8_private simde_poly8x8_t; + typedef simde_poly16x4_private simde_poly16x4_t; + typedef simde_poly8x16_private simde_poly8x16_t; + typedef simde_poly16x8_private simde_poly16x8_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT) + typedef simde_poly64 simde_poly64_t; + typedef simde_poly64x1_private simde_poly64x1_t; + typedef simde_poly64x2_private simde_poly64x2_t; + typedef struct simde_poly64x1x2_t { + simde_poly64x1_t val[2]; + } simde_poly64x1x2_t; + typedef struct simde_poly64x2x2_t { + simde_poly64x2_t val[2]; + } simde_poly64x2x2_t; + typedef struct simde_poly64x1x3_t { + simde_poly64x1_t val[3]; + } simde_poly64x1x3_t; + typedef struct simde_poly64x2x3_t { + simde_poly64x2_t val[3]; + } simde_poly64x2x3_t; + typedef struct simde_poly64x1x4_t { + simde_poly64x1_t val[4]; + } simde_poly64x1x4_t; + typedef struct simde_poly64x2x4_t { + simde_poly64x2_t val[4]; + } simde_poly64x2x4_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT) + typedef simde_poly128 simde_poly128_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN) + typedef struct simde_poly8x8x2_t { + simde_poly8x8_t val[2]; + } simde_poly8x8x2_t; + typedef struct simde_poly16x4x2_t { + simde_poly16x4_t val[2]; + } simde_poly16x4x2_t; + typedef struct simde_poly8x16x2_t { + simde_poly8x16_t val[2]; + } simde_poly8x16x2_t; + typedef struct simde_poly16x8x2_t { + simde_poly16x8_t val[2]; + } simde_poly16x8x2_t; + + typedef struct simde_poly8x8x3_t { + simde_poly8x8_t val[3]; + } simde_poly8x8x3_t; + typedef struct simde_poly16x4x3_t { + simde_poly16x4_t val[3]; + } simde_poly16x4x3_t; + typedef struct simde_poly8x16x3_t { + simde_poly8x16_t val[3]; + } simde_poly8x16x3_t; + typedef struct simde_poly16x8x3_t { + simde_poly16x8_t val[3]; + } simde_poly16x8x3_t; + + typedef struct simde_poly8x8x4_t { + simde_poly8x8_t val[4]; + } simde_poly8x8x4_t; + typedef struct simde_poly16x4x4_t { + simde_poly16x4_t val[4]; + } simde_poly16x4x4_t; + typedef struct simde_poly8x16x4_t { + simde_poly8x16_t val[4]; + } simde_poly8x16x4_t; + typedef struct simde_poly16x8x4_t { + simde_poly16x8_t val[4]; + } simde_poly16x8x4_t; +#endif + +#if 
defined(SIMDE_ARM_NEON_NEED_PORTABLE_BF16) + typedef simde_bfloat16 simde_bfloat16_t; + typedef simde_bfloat16x4_private simde_bfloat16x4_t; + typedef simde_bfloat16x8_private simde_bfloat16x8_t; + typedef struct simde_bfloat16x4x2_t { + simde_bfloat16x4_t val[2]; + } simde_bfloat16x4x2_t; + + typedef struct simde_bfloat16x8x2_t { + simde_bfloat16x8_t val[2]; + } simde_bfloat16x8x2_t; + + typedef struct simde_bfloat16x4x3_t { + simde_bfloat16x4_t val[3]; + } simde_bfloat16x4x3_t; + + typedef struct simde_bfloat16x8x3_t { + simde_bfloat16x8_t val[3]; + } simde_bfloat16x8x3_t; + + typedef struct simde_bfloat16x4x4_t { + simde_bfloat16x4_t val[4]; + } simde_bfloat16x4x4_t; + + typedef struct simde_bfloat16x8x4_t { + simde_bfloat16x8_t val[4]; + } simde_bfloat16x8x4_t; +#endif + #if defined(SIMDE_ARM_NEON_NEED_PORTABLE_I8X8) || defined(SIMDE_ARM_NEON_NEED_PORTABLE_64BIT) typedef simde_int8x8_private simde_int8x8_t; #endif @@ -647,6 +1061,25 @@ typedef union { typedef simde_float16 simde_float16_t; typedef simde_float16x4_private simde_float16x4_t; typedef simde_float16x8_private simde_float16x8_t; + + typedef struct simde_float16x4x2_t { + simde_float16x4_t val[2]; + } simde_float16x4x2_t; + typedef struct simde_float16x4x3_t { + simde_float16x4_t val[3]; + } simde_float16x4x3_t; + typedef struct simde_float16x4x4_t { + simde_float16x4_t val[4]; + } simde_float16x4x4_t; + typedef struct simde_float16x8x2_t { + simde_float16x8_t val[2]; + } simde_float16x8x2_t; + typedef struct simde_float16x8x3_t { + simde_float16x8_t val[3]; + } simde_float16x8x3_t; + typedef struct simde_float16x8x4_t { + simde_float16x8_t val[4]; + } simde_float16x8x4_t; #endif #if defined(SIMDE_ARM_NEON_NEED_PORTABLE_F32) typedef simde_float32 simde_float32_t; @@ -854,10 +1287,18 @@ typedef union { #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - typedef simde_float16_t float16_t; + typedef simde_float16_t float16_t; + typedef simde_float16x4x2_t float16x4x2_t; + typedef simde_float16x4x3_t float16x4x3_t; + typedef simde_float16x4x4_t float16x4x4_t; + typedef simde_float16x8x2_t float16x8x2_t; + typedef simde_float16x8x3_t float16x8x3_t; + typedef simde_float16x8x4_t float16x8x4_t; #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) typedef simde_float32_t float32_t; + typedef simde_poly8_t poly8_t; + typedef simde_poly16_t poly16_t; typedef simde_int8x8_t int8x8_t; typedef simde_int16x4_t int16x4_t; @@ -868,6 +1309,8 @@ typedef union { typedef simde_uint32x2_t uint32x2_t; typedef simde_uint64x1_t uint64x1_t; typedef simde_float32x2_t float32x2_t; + typedef simde_poly8x8_t poly8x8_t; + typedef simde_poly16x4_t poly16x4_t; typedef simde_int8x16_t int8x16_t; typedef simde_int16x8_t int16x8_t; @@ -878,6 +1321,8 @@ typedef union { typedef simde_uint32x4_t uint32x4_t; typedef simde_uint64x2_t uint64x2_t; typedef simde_float32x4_t float32x4_t; + typedef simde_poly8x16_t poly8x16_t; + typedef simde_poly16x8_t poly16x8_t; typedef simde_int8x8x2_t int8x8x2_t; typedef simde_int16x4x2_t int16x4x2_t; @@ -888,6 +1333,8 @@ typedef union { typedef simde_uint32x2x2_t uint32x2x2_t; typedef simde_uint64x1x2_t uint64x1x2_t; typedef simde_float32x2x2_t float32x2x2_t; + typedef simde_poly8x8x2_t poly8x8x2_t; + typedef simde_poly16x4x2_t poly16x4x2_t; typedef simde_int8x16x2_t int8x16x2_t; typedef simde_int16x8x2_t int16x8x2_t; @@ -898,6 +1345,8 @@ typedef union { typedef simde_uint32x4x2_t uint32x4x2_t; typedef simde_uint64x2x2_t uint64x2x2_t; typedef 
simde_float32x4x2_t float32x4x2_t; + typedef simde_poly8x16x2_t poly8x16x2_t; + typedef simde_poly16x8x2_t poly16x8x2_t; typedef simde_int8x8x3_t int8x8x3_t; typedef simde_int16x4x3_t int16x4x3_t; @@ -908,6 +1357,8 @@ typedef union { typedef simde_uint32x2x3_t uint32x2x3_t; typedef simde_uint64x1x3_t uint64x1x3_t; typedef simde_float32x2x3_t float32x2x3_t; + typedef simde_poly8x8x3_t poly8x8x3_t; + typedef simde_poly16x4x3_t poly16x4x3_t; typedef simde_int8x16x3_t int8x16x3_t; typedef simde_int16x8x3_t int16x8x3_t; @@ -918,6 +1369,8 @@ typedef union { typedef simde_uint32x4x3_t uint32x4x3_t; typedef simde_uint64x2x3_t uint64x2x3_t; typedef simde_float32x4x3_t float32x4x3_t; + typedef simde_poly8x16x3_t poly8x16x3_t; + typedef simde_poly16x8x3_t poly16x8x3_t; typedef simde_int8x8x4_t int8x8x4_t; typedef simde_int16x4x4_t int16x4x4_t; @@ -928,6 +1381,8 @@ typedef union { typedef simde_uint32x2x4_t uint32x2x4_t; typedef simde_uint64x1x4_t uint64x1x4_t; typedef simde_float32x2x4_t float32x2x4_t; + typedef simde_poly8x8x4_t poly8x8x4_t; + typedef simde_poly16x4x4_t poly16x4x4_t; typedef simde_int8x16x4_t int8x16x4_t; typedef simde_int16x8x4_t int16x8x4_t; @@ -938,6 +1393,18 @@ typedef union { typedef simde_uint32x4x4_t uint32x4x4_t; typedef simde_uint64x2x4_t uint64x2x4_t; typedef simde_float32x4x4_t float32x4x4_t; + typedef simde_poly8x16x4_t poly8x16x4_t; + typedef simde_poly16x8x4_t poly16x8x4_t; +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + typedef simde_poly64x1_t poly64x1_t; + typedef simde_poly64x2_t poly64x2_t; + typedef simde_poly64x1x2_t poly64x1x2_t; + typedef simde_poly64x2x2_t poly64x2x2_t; + typedef simde_poly64x1x3_t poly64x1x3_t; + typedef simde_poly64x2x3_t poly64x2x3_t; + typedef simde_poly64x1x4_t poly64x1x4_t; + typedef simde_poly64x2x4_t poly64x2x4_t; #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) typedef simde_float64_t float64_t; @@ -979,7 +1446,7 @@ typedef union { SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float32x4_to_m128, __m128, simde_float32x4_t) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float32x4_from_m128, simde_float32x4_t, __m128) #endif -#if defined(SIMDE_X86_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int8x16_to_m128i, __m128i, simde_int8x16_t) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int16x8_to_m128i, __m128i, simde_int16x8_t) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int32x4_to_m128i, __m128i, simde_int32x4_t) @@ -1039,6 +1506,10 @@ SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint64x1) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float16x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float32x2) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float64x1) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly8x8) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly16x4) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly64x1) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(bfloat16x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(int8x16) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(int16x8) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(int32x4) @@ -1047,9 +1518,13 @@ SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint8x16) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint16x8) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint32x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint64x2) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly8x16) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly16x8) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly64x2) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float16x8) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float32x4) 
SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float64x2) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(bfloat16x8) SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/uqadd.h b/arm/neon/uqadd.h index 576fbb576..42535de5e 100644 --- a/arm/neon/uqadd.h +++ b/arm/neon/uqadd.h @@ -33,6 +33,18 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +// Workaround on ARM64 windows due to windows SDK bug +// https://developercommunity.visualstudio.com/t/In-arm64_neonh-vsqaddb_u8-vsqaddh_u16/10271747?sort=newest +#if (defined _MSC_VER) && (defined SIMDE_ARM_NEON_A64V8_NATIVE) && (_MSC_VER < 1938) +#pragma message ("Due to msvc bug, current version of msvc is supported by workaround. Recommend to update msvc") +#undef vuqaddh_s16 +#define vuqaddh_s16(src1, src2) neon_suqadds16(__int16ToN16_v(src1), __uint16ToN16_v(src2)).n16_i16[0] +#undef vuqadds_s32 +#define vuqadds_s32(src1, src2) _CopyInt32FromFloat(neon_suqadds32(_CopyFloatFromInt32(src1), _CopyFloatFromUInt32(src2))) +#undef vuqaddd_s64 +#define vuqaddd_s64(src1, src2) neon_suqadds64(__int64ToN64_v(src1), __uint64ToN64_v(src2)).n64_i64[0] +#endif + SIMDE_FUNCTION_ATTRIBUTES int8_t simde_vuqaddb_s8(int8_t a, uint8_t b) { diff --git a/arm/neon/usdot.h b/arm/neon/usdot.h new file mode 100644 index 000000000..40adc65c1 --- /dev/null +++ b/arm/neon/usdot.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
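The uqadd.h workaround above only re-routes the scalar saturating-accumulate intrinsics to the Windows SDK's internal helpers; the underlying operation is a signed saturating add of an unsigned operand. A minimal scalar sketch of that semantic, using an illustrative helper name that is not part of SIMDe:

    #include <stdint.h>

    /* Illustrative only: saturating accumulate of an unsigned 16-bit value
     * into a signed 16-bit value, i.e. the semantic behind vuqaddh_s16. */
    static int16_t suqadd16_sketch(int16_t a, uint16_t b) {
      int32_t sum = (int32_t) a + (int32_t) b;  /* widen so the add cannot overflow */
      if (sum > INT16_MAX) return INT16_MAX;    /* saturate on the high side */
      return (int16_t) sum;                     /* b >= 0, so sum >= INT16_MIN already */
    }
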
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_USDOT_H) +#define SIMDE_ARM_NEON_USDOT_H + +#include "types.h" + +#include "add.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vusdot_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vusdot_s32(r, a, b); + #else + simde_int32x2_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + return simde_vadd_s32(r, simde_int32x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vusdot_s32 + #define vusdot_s32(r, a, b) simde_vusdot_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusdotq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vusdotq_s32(r, a, b); + #else + simde_int32x4_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + for (int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vusdotq_s32 + #define vusdotq_s32(r, a, b) simde_vusdotq_s32((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_USDOT_H) */ diff --git a/arm/neon/usdot_lane.h b/arm/neon/usdot_lane.h new file mode 100644 index 000000000..512b685ce --- /dev/null +++ b/arm/neon/usdot_lane.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_USDOT_LANE_H) +#define SIMDE_ARM_NEON_USDOT_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vusdot_lane_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vusdot_lane_s32(r, a, b, lane) vusdot_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vusdot_lane_s32 + #define vusdot_lane_s32(r, a, b, lane) simde_vusdot_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vusdot_laneq_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vusdot_laneq_s32(r, a, b, lane) vusdot_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vusdot_laneq_s32 + #define vusdot_laneq_s32(r, a, b, lane) simde_vusdot_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusdotq_laneq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int 
idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vusdotq_laneq_s32(r, a, b, lane) vusdotq_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vusdotq_laneq_s32 + #define vusdotq_laneq_s32(r, a, b, lane) simde_vusdotq_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusdotq_lane_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vusdotq_lane_s32(r, a, b, lane) vusdotq_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) + #undef vusdotq_lane_s32 + #define vusdotq_lane_s32(r, a, b, lane) simde_vusdotq_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_USDOT_LANE_H) */ diff --git a/arm/neon/uzp.h b/arm/neon/uzp.h index b44db4477..439dfe65c 100644 --- a/arm/neon/uzp.h +++ b/arm/neon/uzp.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_UZP_H) && !defined(SIMDE_BUG_INTEL_857088) @@ -36,6 +37,22 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vuzp_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp_f16(a, b); + #else + simde_float16x4x2_t r = { { simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vuzp_f16 + #define vuzp_f16(a, b) simde_vuzp_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vuzp_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -141,6 +158,22 @@ simde_vuzp_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vuzp_u32(a, b) simde_vuzp_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vuzpq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzpq_f16(a, b); + #else + simde_float16x8x2_t r = { { simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b) } }; + return r; + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vuzpq_f16 + #define vuzpq_f16(a, b) simde_vuzpq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vuzpq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -246,6 +279,66 @@ simde_vuzpq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vuzpq_u32(a, b) simde_vuzpq_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vuzp_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzp_p8(a, b); + #else + simde_poly8x8x2_t r = { { simde_vuzp1_p8(a, b), simde_vuzp2_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzp_p8 + #define vuzp_p8(a, b) simde_vuzp_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vuzp_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzp_p16(a, b); + #else + simde_poly16x4x2_t r = { { simde_vuzp1_p16(a, b), simde_vuzp2_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzp_p16 + #define vuzp_p16(a, b) simde_vuzp_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vuzpq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzpq_p8(a, b); + #else + simde_poly8x16x2_t r = { { simde_vuzp1q_p8(a, b), simde_vuzp2q_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzpq_p8 + #define vuzpq_p8(a, b) simde_vuzpq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vuzpq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzpq_p16(a, b); + #else + simde_poly16x8x2_t r = { { simde_vuzp1q_p16(a, b), simde_vuzp2q_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzpq_p16 + #define vuzpq_p16(a, b) simde_vuzpq_p16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/uzp1.h b/arm/neon/uzp1.h index 6cf65a782..0ef6b33cc 100644 --- a/arm/neon/uzp1.h +++ b/arm/neon/uzp1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_UZP1_H) @@ -34,6 +35,34 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vuzp1_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp1_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vuzp1_f16 + #define vuzp1_f16(a, b) simde_vuzp1_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vuzp1_f32(simde_float32x2_t a, simde_float32x2_t b) { 
@@ -272,6 +301,37 @@ simde_vuzp1_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vuzp1_u32(a, b) simde_vuzp1_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vuzp1q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp1q_f16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + float16x8x2_t t = vuzpq_f16(a, b); + return t.val[0]; + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vuzp1q_f16 + #define vuzp1q_f16(a, b) simde_vuzp1q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vuzp1q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -637,6 +697,153 @@ simde_vuzp1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vuzp1q_u64(a, b) simde_vuzp1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vuzp1_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8x2_t t = vuzp_p8(a, b); + return t.val[0]; + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1_p8 + #define vuzp1_p8(a, b) simde_vuzp1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vuzp1_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4x2_t t = vuzp_p16(a, b); + return t.val[0]; + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1_p16 + #define vuzp1_p16(a, b) simde_vuzp1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vuzp1q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1q_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x16x2_t t = vuzpq_p8(a, b); + return t.val[0]; + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + 
for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_p8 + #define vuzp1q_p8(a, b) simde_vuzp1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vuzp1q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1q_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x8x2_t t = vuzpq_p16(a, b); + return t.val[0]; + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_p16 + #define vuzp1q_p16(a, b) simde_vuzp1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vuzp1q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_p64 + #define vuzp1q_p64(a, b) simde_vuzp1q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/uzp2.h b/arm/neon/uzp2.h index 26856ab7e..7692a7d66 100644 --- a/arm/neon/uzp2.h +++ b/arm/neon/uzp2.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_UZP2_H) @@ -34,6 +35,34 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vuzp2_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp2_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vuzp2_f16 + #define vuzp2_f16(a, b) simde_vuzp2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vuzp2_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -272,6 +301,37 @@ simde_vuzp2_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vuzp2_u32(a, b) simde_vuzp2_u32((a), (b)) #endif 
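The portable branches of the new poly and float16 uzp1/uzp2 variants all share one de-interleave pattern: uzp1 collects the even-indexed lanes of both inputs and uzp2 the odd-indexed lanes, each writing the lanes taken from a into the low half of the result and the lanes taken from b into the high half. A stand-alone sketch of that indexing on plain arrays (the function names are illustrative, not SIMDe API):

    #include <stddef.h>
    #include <stdint.h>

    #define LANES 8

    /* uzp1-style: even lanes of a, then even lanes of b. */
    static void uzp1_sketch(uint8_t r[LANES], const uint8_t a[LANES], const uint8_t b[LANES]) {
      const size_t half = LANES / 2;
      for (size_t i = 0 ; i < half ; i++) {
        r[i]        = a[2 * i];      /* even-indexed elements of a -> low half  */
        r[i + half] = b[2 * i];      /* even-indexed elements of b -> high half */
      }
    }

    /* uzp2-style: odd lanes of a, then odd lanes of b. */
    static void uzp2_sketch(uint8_t r[LANES], const uint8_t a[LANES], const uint8_t b[LANES]) {
      const size_t half = LANES / 2;
      for (size_t i = 0 ; i < half ; i++) {
        r[i]        = a[2 * i + 1];  /* odd-indexed elements of a -> low half  */
        r[i + half] = b[2 * i + 1];  /* odd-indexed elements of b -> high half */
      }
    }
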
+SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vuzp2q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp2q_f16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + float16x8x2_t t = vuzpq_f16(a, b); + return t.val[1]; + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vuzp2q_f16 + #define vuzp2q_f16(a, b) simde_vuzp2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vuzp2q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -641,6 +701,153 @@ simde_vuzp2q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vuzp2q_u64(a, b) simde_vuzp2q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vuzp2_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly8x8x2_t t = vuzp_p8(a, b); + return t.val[1]; + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2_p8 + #define vuzp2_p8(a, b) simde_vuzp2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vuzp2_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly16x4x2_t t = vuzp_p16(a, b); + return t.val[1]; + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2_p16 + #define vuzp2_p16(a, b) simde_vuzp2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vuzp2q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2q_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly8x16x2_t t = vuzpq_p8(a, b); + return t.val[1]; + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + 
halfway_point] = b_.values[idx | 1]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_p8 + #define vuzp2q_p8(a, b) simde_vuzp2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vuzp2q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2q_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly16x8x2_t t = vuzpq_p16(a, b); + return t.val[1]; + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_p16 + #define vuzp2q_p16(a, b) simde_vuzp2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vuzp2q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_p64 + #define vuzp2q_p64(a, b) simde_vuzp2q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/xar.h b/arm/neon/xar.h index d48db05ed..b7b2c5836 100644 --- a/arm/neon/xar.h +++ b/arm/neon/xar.h @@ -49,10 +49,10 @@ simde_vxarq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int d) return simde_uint64x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) #define simde_vxarq_u64(a, b, d) vxarq_u64((a), (b), (d)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vxarq_u64 #define vxarq_u64(a, b, d) simde_vxarq_u64((a), (b), (d)) #endif diff --git a/arm/neon/zip.h b/arm/neon/zip.h index 830a8d4db..d0a8d294c 100644 --- a/arm/neon/zip.h +++ b/arm/neon/zip.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ZIP_H) && !defined(SIMDE_BUG_INTEL_857088) @@ -36,6 +37,22 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vzip_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip_f16(a, b); + #else + simde_float16x4x2_t r = { { simde_vzip1_f16(a, b), simde_vzip2_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vzip_f16 + #define vzip_f16(a, b) simde_vzip_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vzip_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -141,6 +158,22 @@ simde_vzip_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vzip_u32(a, b) simde_vzip_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vzipq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzipq_f16(a, b); + #else + simde_float16x8x2_t r = { { simde_vzip1q_f16(a, b), simde_vzip2q_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vzipq_f16 + #define vzipq_f16(a, b) simde_vzipq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vzipq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -246,6 +279,66 @@ simde_vzipq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vzipq_u32(a, b) simde_vzipq_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vzip_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzip_p8(a, b); + #else + simde_poly8x8x2_t r = { { simde_vzip1_p8(a, b), simde_vzip2_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzip_p8 + #define vzip_p8(a, b) simde_vzip_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vzip_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzip_p16(a, b); + #else + simde_poly16x4x2_t r = { { simde_vzip1_p16(a, b), simde_vzip2_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzip_p16 + #define vzip_p16(a, b) simde_vzip_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vzipq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzipq_p8(a, b); + #else + simde_poly8x16x2_t r = { { simde_vzip1q_p8(a, b), simde_vzip2q_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzipq_p8 + #define vzipq_p8(a, b) simde_vzipq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vzipq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzipq_p16(a, b); + #else + simde_poly16x8x2_t r = { { simde_vzip1q_p16(a, b), simde_vzip2q_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzipq_p16 + #define vzipq_p16(a, b) simde_vzipq_p16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/zip1.h b/arm/neon/zip1.h index b0298be4f..7eaae8bc1 100644 --- a/arm/neon/zip1.h +++ b/arm/neon/zip1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ZIP1_H) @@ -34,6 +35,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vzip1_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip1_f16(a, b); + #else + simde_float16x4_private + 
r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vzip1_f16 + #define vzip1_f16(a, b) simde_vzip1_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vzip1_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -279,6 +307,33 @@ simde_vzip1_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vzip1_u32(a, b) simde_vzip1_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vzip1q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip1q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vzip1q_f16 + #define vzip1q_f16(a, b) simde_vzip1q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vzip1q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -661,6 +716,148 @@ simde_vzip1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vzip1q_u64(a, b) simde_vzip1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vzip1_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8x2_t tmp = vzip_p8(a, b); + return tmp.val[0]; + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1_p8 + #define vzip1_p8(a, b) simde_vzip1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vzip1_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4x2_t tmp = vzip_p16(a, b); + return tmp.val[0]; + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1_p16 + #define vzip1_p16(a, b) simde_vzip1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t 
+simde_vzip1q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1q_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8x2_t tmp = vzip_p8(vget_low_p8(a), vget_low_p8(b)); + return vcombine_p8(tmp.val[0], tmp.val[1]); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_p8 + #define vzip1q_p8(a, b) simde_vzip1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vzip1q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1q_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4x2_t tmp = vzip_p16(vget_low_p16(a), vget_low_p16(b)); + return vcombine_p16(tmp.val[0], tmp.val[1]); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_p16 + #define vzip1q_p16(a, b) simde_vzip1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vzip1q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_p64 + #define vzip1q_p64(a, b) simde_vzip1q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/neon/zip2.h b/arm/neon/zip2.h index bf78b1201..1c195b83f 100644 --- a/arm/neon/zip2.h +++ b/arm/neon/zip2.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ZIP2_H) @@ -34,6 +35,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vzip2_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip2_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vzip2_f16 + #define vzip2_f16(a, b) simde_vzip2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vzip2_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -258,6 +286,33 @@ simde_vzip2_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vzip2_u32(a, b) simde_vzip2_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vzip2q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip2q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) + #undef vzip2q_f16 + #define vzip2q_f16(a, b) simde_vzip2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vzip2q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -619,6 +674,136 @@ simde_vzip2q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vzip2q_u64(a, b) simde_vzip2q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vzip2_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2_p8 + #define vzip2_p8(a, b) simde_vzip2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vzip2_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2_p16 + #define vzip2_p16(a, b) simde_vzip2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vzip2q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2q_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = 
b_.values[halfway_point + i]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2q_p8 + #define vzip2q_p8(a, b) simde_vzip2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vzip2q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2q_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2q_p16 + #define vzip2q_p16(a, b) simde_vzip2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vzip2q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2q_p64 + #define vzip2q_p64(a, b) simde_vzip2q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/arm/sve/and.h b/arm/sve/and.h index 76b37d20b..12d3f63bc 100644 --- a/arm/sve/and.h +++ b/arm/sve/and.h @@ -316,7 +316,8 @@ simde_svint32_t simde_svand_s32_z(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s32_z(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint32_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -340,7 +341,8 @@ simde_svint32_t simde_svand_s32_m(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s32_m(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint32_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -452,7 +454,8 @@ simde_svint64_t simde_svand_s64_z(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s64_z(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint64_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 
512 @@ -476,7 +479,8 @@ simde_svint64_t simde_svand_s64_m(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s64_m(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint64_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 diff --git a/arm/sve/cmplt.h b/arm/sve/cmplt.h index fe400c4dd..5df0f8441 100644 --- a/arm/sve/cmplt.h +++ b/arm/sve/cmplt.h @@ -40,9 +40,11 @@ simde_svcmplt_s8(simde_svbool_t pg, simde_svint8_t op1, simde_svint8_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask64(_mm512_mask_cmplt_epi8_mask(simde_svbool_to_mmask64(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm256_mask_cmplt_epi8_mask(simde_svbool_to_mmask32(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_i8 = vandq_s8(pg.neon_i8, vreinterpretq_s8_u8(vcltq_s8(op1.neon, op2.neon))); @@ -81,9 +83,11 @@ simde_svcmplt_s16(simde_svbool_t pg, simde_svint16_t op1, simde_svint16_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm512_mask_cmplt_epi16_mask(simde_svbool_to_mmask32(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm256_mask_cmplt_epi16_mask(simde_svbool_to_mmask16(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_i16 = vandq_s16(pg.neon_i16, vreinterpretq_s16_u16(vcltq_s16(op1.neon, op2.neon))); @@ -122,9 +126,11 @@ simde_svcmplt_s32(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm512_mask_cmplt_epi32_mask(simde_svbool_to_mmask16(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm256_mask_cmplt_epi32_mask(simde_svbool_to_mmask8(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_i32 = vandq_s32(pg.neon_i32, vreinterpretq_s32_u32(vcltq_s32(op1.neon, 
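/* NEON compares yield all-ones/all-zeros lanes rather than a predicate, so
   the vcltq_* result is ANDed with the predicate register to clear lanes
   that are inactive in pg, mimicking SVE's predicated compare semantics. */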
op2.neon))); @@ -163,9 +169,11 @@ simde_svcmplt_s64(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm512_mask_cmplt_epi64_mask(simde_svbool_to_mmask8(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask4(_mm256_mask_cmplt_epi64_mask(simde_svbool_to_mmask4(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r.neon_i64 = vandq_s64(pg.neon_i64, vreinterpretq_s64_u64(vcltq_s64(op1.neon, op2.neon))); @@ -200,9 +208,11 @@ simde_svcmplt_u8(simde_svbool_t pg, simde_svuint8_t op1, simde_svuint8_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask64(_mm512_mask_cmplt_epu8_mask(simde_svbool_to_mmask64(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm256_mask_cmplt_epu8_mask(simde_svbool_to_mmask32(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u8 = vandq_u8(pg.neon_u8, vcltq_u8(op1.neon, op2.neon)); @@ -237,9 +247,11 @@ simde_svcmplt_u16(simde_svbool_t pg, simde_svuint16_t op1, simde_svuint16_t op2) #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm512_mask_cmplt_epu16_mask(simde_svbool_to_mmask32(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm256_mask_cmplt_epu16_mask(simde_svbool_to_mmask16(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u16 = vandq_u16(pg.neon_u16, vcltq_u16(op1.neon, op2.neon)); @@ -274,9 +286,11 @@ simde_svcmplt_u32(simde_svbool_t pg, simde_svuint32_t op1, simde_svuint32_t op2) #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm512_mask_cmplt_epu32_mask(simde_svbool_to_mmask16(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || 
HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm256_mask_cmplt_epu32_mask(simde_svbool_to_mmask8(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u32 = vandq_u32(pg.neon_u32, vcltq_u32(op1.neon, op2.neon)); @@ -311,9 +325,11 @@ simde_svcmplt_u64(simde_svbool_t pg, simde_svuint64_t op1, simde_svuint64_t op2) #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm512_mask_cmplt_epu64_mask(simde_svbool_to_mmask8(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask4(_mm256_mask_cmplt_epu64_mask(simde_svbool_to_mmask4(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r.neon_u64 = vandq_u64(pg.neon_u64, vcltq_u64(op1.neon, op2.neon)); @@ -348,9 +364,11 @@ simde_svcmplt_f32(simde_svbool_t pg, simde_svfloat32_t op1, simde_svfloat32_t op #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm512_mask_cmp_ps_mask(simde_svbool_to_mmask16(pg), op1.m512, op2.m512, _CMP_LT_OQ)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm256_mask_cmp_ps_mask(simde_svbool_to_mmask8(pg), op1.m256[0], op2.m256[0], _CMP_LT_OQ)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u32 = vandq_u32(pg.neon_u32, vcltq_f32(op1.neon, op2.neon)); @@ -389,9 +407,11 @@ simde_svcmplt_f64(simde_svbool_t pg, simde_svfloat64_t op1, simde_svfloat64_t op #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm512_mask_cmp_pd_mask(simde_svbool_to_mmask8(pg), op1.m512d, op2.m512d, _CMP_LT_OQ)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask4(_mm256_mask_cmp_pd_mask(simde_svbool_to_mmask4(pg), op1.m256d[0], op2.m256d[0], _CMP_LT_OQ)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r.neon_u64 = vandq_u64(pg.neon_u64, vcltq_f64(op1.neon, op2.neon)); diff --git a/arm/sve/ld1.h b/arm/sve/ld1.h index 607c3be40..8008ad60a 100644 --- a/arm/sve/ld1.h +++ b/arm/sve/ld1.h @@ -51,9 +51,11 @@ simde_svld1_s8(simde_svbool_t pg, const int8_t * base) { #else simde_svint8_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) 
r.m512i = _mm512_maskz_loadu_epi8(simde_svbool_to_mmask64(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi8(simde_svbool_to_mmask32(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { @@ -77,9 +79,11 @@ simde_svld1_s16(simde_svbool_t pg, const int16_t * base) { #else simde_svint16_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi16(simde_svbool_to_mmask32(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi16(simde_svbool_to_mmask16(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { @@ -103,9 +107,11 @@ simde_svld1_s32(simde_svbool_t pg, const int32_t * base) { #else simde_svint32_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi32(simde_svbool_to_mmask16(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi32(simde_svbool_to_mmask8(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { @@ -129,9 +135,11 @@ simde_svld1_s64(simde_svbool_t pg, const int64_t * base) { #else simde_svint64_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi64(simde_svbool_to_mmask8(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi64(simde_svbool_to_mmask4(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { @@ -155,9 +163,11 @@ simde_svld1_u8(simde_svbool_t pg, const uint8_t * base) { #else simde_svuint8_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi8(simde_svbool_to_mmask64(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = 
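/* In these SVE load fallbacks the predicate is converted to an AVX-512
   k-mask and a zero-masking (_maskz_) load is issued. The extra
   HEDLEY_MSVC_VERSION_CHECK(19,20,0) term added throughout this patch
   disables the AVX-512 paths on MSVC older than 19.20 (Visual Studio 2019),
   so those compilers fall through to the scalar per-element loop instead. */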
_mm256_maskz_loadu_epi8(simde_svbool_to_mmask32(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { @@ -181,9 +191,11 @@ simde_svld1_u16(simde_svbool_t pg, const uint16_t * base) { #else simde_svuint16_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi16(simde_svbool_to_mmask32(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi16(simde_svbool_to_mmask16(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { @@ -207,9 +219,11 @@ simde_svld1_u32(simde_svbool_t pg, const uint32_t * base) { #else simde_svuint32_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi32(simde_svbool_to_mmask16(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi32(simde_svbool_to_mmask8(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { @@ -233,9 +247,11 @@ simde_svld1_u64(simde_svbool_t pg, const uint64_t * base) { #else simde_svuint64_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi64(simde_svbool_to_mmask8(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi64(simde_svbool_to_mmask4(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { @@ -259,9 +275,11 @@ simde_svld1_f32(simde_svbool_t pg, const simde_float32 * base) { #else simde_svfloat32_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512 = _mm512_maskz_loadu_ps(simde_svbool_to_mmask16(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256[0] = _mm256_maskz_loadu_ps(simde_svbool_to_mmask8(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { @@ -285,9 +303,11 @@ simde_svld1_f64(simde_svbool_t pg, const simde_float64 * base) { #else simde_svfloat64_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if 
defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512d = _mm512_maskz_loadu_pd(simde_svbool_to_mmask8(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256d[0] = _mm256_maskz_loadu_pd(simde_svbool_to_mmask4(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { diff --git a/arm/sve/ptest.h b/arm/sve/ptest.h index 5e6adb8b4..304633115 100644 --- a/arm/sve/ptest.h +++ b/arm/sve/ptest.h @@ -37,7 +37,7 @@ simde_bool simde_svptest_first(simde_svbool_t pg, simde_svbool_t op) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptest_first(pg, op); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_LIKELY(pg.value & 1)) return op.value & 1; diff --git a/arm/sve/ptrue.h b/arm/sve/ptrue.h index b894b1e01..064b96ace 100644 --- a/arm/sve/ptrue.h +++ b/arm/sve/ptrue.h @@ -37,7 +37,7 @@ simde_svbool_t simde_svptrue_b8(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b8(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -67,7 +67,7 @@ simde_svbool_t simde_svptrue_b16(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b16(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -97,7 +97,7 @@ simde_svbool_t simde_svptrue_b32(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b32(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -127,7 +127,7 @@ simde_svbool_t simde_svptrue_b64(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b64(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 diff --git a/arm/sve/sel.h b/arm/sve/sel.h index eb9b9f3cc..a5e79b567 100644 --- a/arm/sve/sel.h +++ b/arm/sve/sel.h @@ -43,9 +43,11 @@ simde_x_svsel_s8_z(simde_svbool_t pg, simde_svint8_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s8(pg.neon_i8, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi8(simde_svbool_to_mmask64(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi8(simde_svbool_to_mmask32(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / 
sizeof(r.m256i[0])) ; i++) { @@ -84,9 +86,11 @@ simde_svsel_s8(simde_svbool_t pg, simde_svint8_t op1, simde_svint8_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s8(pg.neon_u8, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi8(op2.m512i, simde_svbool_to_mmask64(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi8(op2.m256i[0], simde_svbool_to_mmask32(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -131,9 +135,11 @@ simde_x_svsel_s16_z(simde_svbool_t pg, simde_svint16_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s16(pg.neon_i16, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi16(simde_svbool_to_mmask32(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi16(simde_svbool_to_mmask16(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -172,9 +178,11 @@ simde_svsel_s16(simde_svbool_t pg, simde_svint16_t op1, simde_svint16_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s16(pg.neon_u16, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi16(op2.m512i, simde_svbool_to_mmask32(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi16(op2.m256i[0], simde_svbool_to_mmask16(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -219,9 +227,11 @@ simde_x_svsel_s32_z(simde_svbool_t pg, simde_svint32_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s32(pg.neon_i32, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi32(simde_svbool_to_mmask16(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && 
(!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi32(simde_svbool_to_mmask8(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -260,9 +270,11 @@ simde_svsel_s32(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s32(pg.neon_u32, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi32(op2.m512i, simde_svbool_to_mmask16(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi32(op2.m256i[0], simde_svbool_to_mmask8(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -307,9 +319,11 @@ simde_x_svsel_s64_z(simde_svbool_t pg, simde_svint64_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s64(pg.neon_i64, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi64(simde_svbool_to_mmask8(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi64(simde_svbool_to_mmask4(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -348,9 +362,11 @@ simde_svsel_s64(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s64(pg.neon_u64, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi64(op2.m512i, simde_svbool_to_mmask8(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi64(op2.m256i[0], simde_svbool_to_mmask4(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -390,7 +406,8 @@ simde_svuint8_t simde_x_svsel_u8_z(simde_svbool_t pg, simde_svuint8_t op1) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_u8_z(pg, op1, op1); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || 
defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svuint8_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -410,7 +427,8 @@ simde_svuint8_t simde_svsel_u8(simde_svbool_t pg, simde_svuint8_t op1, simde_svuint8_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svsel_u8(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svuint8_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 diff --git a/arm/sve/st1.h b/arm/sve/st1.h index 39f5c4c79..e3c6230d8 100644 --- a/arm/sve/st1.h +++ b/arm/sve/st1.h @@ -37,9 +37,11 @@ void simde_svst1_s8(simde_svbool_t pg, int8_t * base, simde_svint8_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s8(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) _mm512_mask_storeu_epi8(base, simde_svbool_to_mmask64(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) _mm256_mask_storeu_epi8(base, simde_svbool_to_mmask32(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { @@ -59,10 +61,12 @@ void simde_svst1_s16(simde_svbool_t pg, int16_t * base, simde_svint16_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s16(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { if (pg.values_i16[i]) { @@ -81,10 +85,12 @@ void simde_svst1_s32(simde_svbool_t pg, int32_t * base, simde_svint32_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s32(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && 
(!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { if (pg.values_i32[i]) { @@ -103,10 +109,12 @@ void simde_svst1_s64(simde_svbool_t pg, int64_t * base, simde_svint64_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s64(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { if (pg.values_i64[i]) { @@ -125,10 +133,12 @@ void simde_svst1_u8(simde_svbool_t pg, uint8_t * base, simde_svuint8_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u8(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi8(base, simde_svbool_to_mmask64(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi8(base, simde_svbool_to_mmask32(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi8(base, simde_svbool_to_mmask64(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi8(base, simde_svbool_to_mmask32(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { if (pg.values_u8[i]) { @@ -147,10 +157,12 @@ void simde_svst1_u16(simde_svbool_t pg, uint16_t * base, simde_svuint16_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u16(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { if (pg.values_u16[i]) { @@ -169,10 +181,12 @@ void simde_svst1_u32(simde_svbool_t pg, uint32_t * base, simde_svuint32_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u32(pg, base, data); - #elif 
defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { if (pg.values_u32[i]) { @@ -191,10 +205,12 @@ void simde_svst1_u64(simde_svbool_t pg, uint64_t * base, simde_svuint64_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u64(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { if (pg.values_u64[i]) { @@ -213,10 +229,12 @@ void simde_svst1_f32(simde_svbool_t pg, simde_float32 * base, simde_svfloat32_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_f32(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_ps(base, simde_svbool_to_mmask16(pg), data.m512); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_ps(base, simde_svbool_to_mmask8(pg), data.m256[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_ps(base, simde_svbool_to_mmask16(pg), data.m512); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_ps(base, simde_svbool_to_mmask8(pg), data.m256[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { if (pg.values_i32[i]) { @@ -235,10 +253,12 @@ void simde_svst1_f64(simde_svbool_t pg, simde_float64 * base, simde_svfloat64_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_f64(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_pd(base, simde_svbool_to_mmask8(pg), data.m512d); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_pd(base, simde_svbool_to_mmask4(pg), data.m256d[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) 
|| HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_pd(base, simde_svbool_to_mmask8(pg), data.m512d); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_pd(base, simde_svbool_to_mmask4(pg), data.m256d[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { if (pg.values_i64[i]) { diff --git a/arm/sve/types.h b/arm/sve/types.h index ae7cbb95e..f0579d96c 100644 --- a/arm/sve/types.h +++ b/arm/sve/types.h @@ -396,7 +396,7 @@ SIMDE_BEGIN_DECLS_ #endif } simde_svfloat64_t; - #if defined(SIMDE_X86_AVX512BW_NATIVE) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) typedef struct { __mmask64 value; int type; diff --git a/arm/sve/whilelt.h b/arm/sve/whilelt.h index 44e024f01..f0e0bd2cd 100644 --- a/arm/sve/whilelt.h +++ b/arm/sve/whilelt.h @@ -37,7 +37,8 @@ simde_svbool_t simde_svwhilelt_b8_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -48,7 +49,8 @@ simde_svwhilelt_b8_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -82,7 +84,8 @@ simde_svbool_t simde_svwhilelt_b16_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -93,7 +96,8 @@ simde_svwhilelt_b16_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -127,7 +131,8 @@ simde_svbool_t simde_svwhilelt_b32_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -138,7 +143,8 @@ simde_svwhilelt_b32_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif 
defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -172,7 +178,8 @@ simde_svbool_t simde_svwhilelt_b64_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -183,7 +190,8 @@ simde_svwhilelt_b64_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -217,7 +225,8 @@ simde_svbool_t simde_svwhilelt_b8_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -228,7 +237,8 @@ simde_svwhilelt_b8_s64(int64_t op1, int64_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -262,18 +272,20 @@ simde_svbool_t simde_svwhilelt_b16_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); int_fast64_t remaining = (HEDLEY_STATIC_CAST(int_fast64_t, op2) - HEDLEY_STATIC_CAST(int_fast64_t, op1)); - __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT64_C(0)); + __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT32_C(0)); if (HEDLEY_UNLIKELY(remaining < 32)) { r >>= 32 - remaining; } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -307,7 +319,8 @@ simde_svbool_t simde_svwhilelt_b32_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif 
defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -318,7 +331,8 @@ simde_svwhilelt_b32_s64(int64_t op1, int64_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -352,7 +366,8 @@ simde_svbool_t simde_svwhilelt_b64_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -363,7 +378,8 @@ simde_svwhilelt_b64_s64(int64_t op1, int64_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -397,7 +413,8 @@ simde_svbool_t simde_svwhilelt_b8_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -408,7 +425,8 @@ simde_svwhilelt_b8_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -442,7 +460,8 @@ simde_svbool_t simde_svwhilelt_b16_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -453,7 +472,8 @@ simde_svwhilelt_b16_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -487,7 
+507,8 @@ simde_svbool_t simde_svwhilelt_b32_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -498,7 +519,8 @@ simde_svwhilelt_b32_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -532,7 +554,8 @@ simde_svbool_t simde_svwhilelt_b64_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -543,7 +566,8 @@ simde_svwhilelt_b64_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -577,7 +601,8 @@ simde_svbool_t simde_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -588,7 +613,8 @@ simde_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -622,18 +648,20 @@ simde_svbool_t simde_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); uint_fast64_t remaining = (HEDLEY_STATIC_CAST(uint_fast64_t, op2) - HEDLEY_STATIC_CAST(uint_fast64_t, op1)); - __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT64_C(0)); + __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, 
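/* Value-wise, ~UINT32_C(0) and the previous ~UINT64_C(0) both produce an
   all-ones __mmask32 after the cast; spelling the constant at its final
   32-bit width simply avoids the implicit 64-to-32-bit narrowing. */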
~UINT32_C(0)); if (HEDLEY_UNLIKELY(remaining < 32)) { r >>= 32 - remaining; } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -667,7 +695,8 @@ simde_svbool_t simde_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -678,7 +707,8 @@ simde_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -712,7 +742,8 @@ simde_svbool_t simde_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -723,7 +754,8 @@ simde_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); diff --git a/check.h b/check.h index 8fd913eb8..7d17d2925 100644 --- a/check.h +++ b/check.h @@ -1,5 +1,5 @@ /* Check (assertions) - * Portable Snippets - https://gitub.com/nemequ/portable-snippets + * Portable Snippets - https://github.com/nemequ/portable-snippets * Created by Evan Nemerson * * To the extent possible under law, the authors have waived all diff --git a/debug-trap.h b/debug-trap.h index 11da805d5..2d3c60f84 100644 --- a/debug-trap.h +++ b/debug-trap.h @@ -1,5 +1,5 @@ /* Debugging assertions and traps - * Portable Snippets - https://gitub.com/nemequ/portable-snippets + * Portable Snippets - https://github.com/nemequ/portable-snippets * Created by Evan Nemerson * * To the extent possible under law, the authors have waived all diff --git a/hedley.h b/hedley.h index 41ac30221..f064f3f4c 100644 --- a/hedley.h +++ b/hedley.h @@ -184,6 +184,7 @@ # undef HEDLEY_EMSCRIPTEN_VERSION #endif #if defined(__EMSCRIPTEN__) +# include # define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) #endif diff --git a/mips/msa/adds.h b/mips/msa/adds.h index e610d482a..2b7efc58f 100644 --- a/mips/msa/adds.h +++ b/mips/msa/adds.h 
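/* The adds.h hunk below switches the guard from the raw compiler macro
   __AVX512VL__ to SIMDe's SIMDE_ARCH_X86_AVX512VL and passes the private
   vector's .m128i view to _mm_ternarylogic_epi32 instead of the bare
   wrapper-typed parameter. With imm8 0x0f and all three operands equal,
   that intrinsic computes a bitwise NOT, equivalent to the SSE2
   XOR-with-all-ones fallback; a minimal sketch for reference (hypothetical
   helper names, not part of the patch):

     __m128i not_via_ternlog(__m128i v) {
       return _mm_ternarylogic_epi32(v, v, v, 0x0f);           // ~v, needs AVX-512VL
     }
     __m128i not_via_xor(__m128i v) {
       return _mm_xor_si128(v, _mm_set1_epi32(~INT32_C(0)));   // ~v, plain SSE2
     }
*/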
@@ -356,8 +356,8 @@ simde_msa_adds_u_w(simde_v4u32 a, simde_v4u32 b) { r_; #if defined(SIMDE_X86_SSE4_1_NATIVE) - #if defined(__AVX512VL__) - __m128i notb = _mm_ternarylogic_epi32(b, b, b, 0x0f); + #if defined(SIMDE_ARCH_X86_AVX512VL) + __m128i notb = _mm_ternarylogic_epi32(b_.m128i, b_.m128i, b_.m128i, 0x0f); #else __m128i notb = _mm_xor_si128(b_.m128i, _mm_set1_epi32(~INT32_C(0))); #endif diff --git a/mips/msa/ld.h b/mips/msa/ld.h index 9f17dbfb8..62662e6b6 100644 --- a/mips/msa/ld.h +++ b/mips/msa/ld.h @@ -37,16 +37,15 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v16i8 simde_msa_ld_b(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_b(rs, s10); - #else - simde_v16i8 r; + simde_v16i8 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_b(rs, s10) __msa_ld_b((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_b #define __msa_ld_b(rs, s10) simde_msa_ld_b((rs), (s10)) @@ -57,16 +56,15 @@ simde_v8i16 simde_msa_ld_h(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int16_t)) == 0, "`s10' must be a multiple of sizeof(int16_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_h(rs, s10); - #else - simde_v8i16 r; + simde_v8i16 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_h(rs, s10) __msa_ld_h((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_h #define __msa_ld_h(rs, s10) simde_msa_ld_h((rs), (s10)) @@ -77,16 +75,15 @@ simde_v4i32 simde_msa_ld_w(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_w(rs, s10); - #else - simde_v4i32 r; + simde_v4i32 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_w(rs, s10) __msa_ld_w((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_w #define __msa_ld_w(rs, s10) simde_msa_ld_w((rs), (s10)) @@ -97,16 +94,15 @@ simde_v2i64 simde_msa_ld_d(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_d(rs, s10); - #else - simde_v2i64 r; + simde_v2i64 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_d(rs, s10) __msa_ld_d((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_d #define __msa_ld_d(rs, s10) simde_msa_ld_d((rs), (s10)) @@ -116,96 +112,90 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v16u8 simde_x_msa_ld_u_b(const void 
* rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v16u8, __msa_ld_b(rs, s10)); - #else - simde_v16u8 r; + simde_v16u8 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_b(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v16u8, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v8u16 simde_x_msa_ld_u_h(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int16_t)) == 0, "`s10' must be a multiple of sizeof(int16_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v8u16, __msa_ld_b(rs, s10)); - #else - simde_v8u16 r; + simde_v8u16 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_h(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v8u16, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v4u32 simde_x_msa_ld_u_w(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v4u32, __msa_ld_b(rs, s10)); - #else - simde_v4u32 r; + simde_v4u32 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_w(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v4u32, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v2u64 simde_x_msa_ld_u_d(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v2u64, __msa_ld_b(rs, s10)); - #else - simde_v2u64 r; + simde_v2u64 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_d(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v2u64, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v4f32 simde_x_msa_fld_w(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v4f32, __msa_ld_b(rs, s10)); - #else - simde_v4f32 r; + simde_v4f32 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_fld_w(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v4f32, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v2f64 simde_x_msa_fld_d(const void * rs, const int s10) 
SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v2f64, __msa_ld_b(rs, s10)); - #else - simde_v2f64 r; + simde_v2f64 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_fld_d(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v2f64, __msa_ld_b((rs), (s10))) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/mips/msa/madd.h b/mips/msa/madd.h index 5037577a4..61cf18e87 100644 --- a/mips/msa/madd.h +++ b/mips/msa/madd.h @@ -38,7 +38,7 @@ simde_v4f32 simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { #if defined(SIMDE_MIPS_MSA_NATIVE) return __msa_fmadd_w(a, b, c); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f32(a, c, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_f32(a, b, c); @@ -56,7 +56,7 @@ simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { #elif defined(SIMDE_X86_SSE_NATIVE) r_.m128 = _mm_add_ps(a_.m128, _mm_mul_ps(b_.m128, c_.m128)); #elif defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f32x4_fma(a_.v128, b_.v128, c_.v128); + r_.v128 = wasm_f32x4_relaxed_madd(b_.v128, c_.v128, a_.v128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_add(a_.v128, wasm_f32x4_mul(b_.v128, c_.v128)); #elif defined(SIMDE_VECTOR_SUBSCRIPT) @@ -73,7 +73,7 @@ simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { } #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_fmadd_w - #define __msa_fmadd_w(a, b) simde_msa_fmadd_w((a), (b)) + #define __msa_fmadd_w(a, b, c) simde_msa_fmadd_w((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -97,7 +97,7 @@ simde_msa_fmadd_d(simde_v2f64 a, simde_v2f64 b, simde_v2f64 c) { #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128d = _mm_add_pd(a_.m128d, _mm_mul_pd(b_.m128d, c_.m128d)); #elif defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - r_.v128 = wasm_f64x2_fma(a_.v128, b_.v128, c_.v128); + r_.v128 = wasm_f64x2_relaxed_madd(b_.v128, c_.v128, a_.v128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_add(a_.v128, wasm_f64x2_mul(b_.v128, c_.v128)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -114,7 +114,7 @@ simde_msa_fmadd_d(simde_v2f64 a, simde_v2f64 b, simde_v2f64 c) { } #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_fmadd_d - #define __msa_fmadd_d(a, b) simde_msa_fmadd_d((a), (b)) + #define __msa_fmadd_d(a, b, c) simde_msa_fmadd_d((a), (b), (c)) #endif SIMDE_END_DECLS_ diff --git a/mips/msa/st.h b/mips/msa/st.h index 2c5b28833..c41c832cc 100644 --- a/mips/msa/st.h +++ b/mips/msa/st.h @@ -37,12 +37,11 @@ SIMDE_FUNCTION_ATTRIBUTES void simde_msa_st_b(simde_v16i8 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_b(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_b(a, rs, s10) __msa_st_b((a), (rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_b #define __msa_st_b(a, rs, s10) 
simde_msa_st_b((a), (rs), (s10)) @@ -53,12 +52,11 @@ void simde_msa_st_h(simde_v8i16 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int16_t)) == 0, "`s10' must be a multiple of sizeof(int16_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_h(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_h(a, rs, s10) __msa_st_h((a), (rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_h #define __msa_st_h(a, rs, s10) simde_msa_st_h((a), (rs), (s10)) @@ -69,12 +67,11 @@ void simde_msa_st_w(simde_v4i32 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_w(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_w(a, rs, s10) __msa_st_w((a), (rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_w #define __msa_st_w(a, rs, s10) simde_msa_st_w((a), (rs), (s10)) @@ -85,12 +82,11 @@ void simde_msa_st_d(simde_v2i64 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_d(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_d(a, rs, s10) __msa_st_d((a), (rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_d #define __msa_st_d(a, rs, s10) simde_msa_st_d((a), (rs), (s10)) diff --git a/mips/msa/types.h b/mips/msa/types.h index b10880c65..93536bc48 100644 --- a/mips/msa/types.h +++ b/mips/msa/types.h @@ -49,7 +49,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int8x16_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -67,7 +67,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int16x8_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -85,7 +85,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -103,7 +103,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -121,8 +121,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int8x16_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -139,8 +139,8 @@ typedef union { 
#if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int16x8_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint16x8_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -157,8 +157,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int32x4_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -175,8 +175,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int64x2_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint64x2_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -193,8 +193,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128 m128; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int32x4_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x4_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -211,8 +211,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128d m128d; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int64x2_t neon; + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + float64x2_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -230,7 +230,7 @@ typedef union { typedef v2u64 simde_v2u64; typedef v4f32 simde_v4f32; typedef v2f64 simde_v2f64; -#elif defined(SIMDE_MIPS_MSA_A32V7_NATIVE) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) typedef int8x16_t simde_v16i8; typedef int16x8_t simde_v8i16; typedef int32x4_t simde_v4i32; @@ -240,7 +240,7 @@ typedef union { typedef uint32x4_t simde_v4u32; typedef uint64x2_t simde_v2u64; typedef float32x4_t simde_v4f32; - #if defined(SIMDE_MIPS_MSA_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) typedef float64x2_t simde_v2f64; #elif defined(SIMDE_VECTOR) typedef double simde_v2f64 __attribute__((__vector_size__(16))); diff --git a/simde-aes.h b/simde-aes.h new file mode 100644 index 000000000..ee1ad7b7e --- /dev/null +++ b/simde-aes.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_AES_H) +#define SIMDE_AES_H + +#include "simde-features.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS + +#if !(defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) + +#if HEDLEY_GCC_VERSION_CHECK(13,2,0) +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") +#endif + +/* + * Number of columns (32-bit words) comprising the State. For this + * standard, Nb = 4. + */ +#define simde_x_aes_Nb 4 + +static uint8_t simde_x_aes_gmult_lookup_table[8][256] = { +{ // gmult(0x02, b); + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, + 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, + 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, + 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, + 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, + 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, + 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, + 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, + 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, + 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, + 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, + 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, + 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, + 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, + 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 +}, +{ // gmult(0x01, b); + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 
0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, +}, +{ // gmult(0x01, b); + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, +}, +{ // gmult(0x03, b); + 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, + 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, + 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, + 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, + 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, + 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, + 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, + 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, + 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a, + 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba, + 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, + 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda, + 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a, + 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, + 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a, + 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a, +}, +{ // gmult(0x0e, b); + 0x00, 0x0e, 0x1c, 0x12, 0x38, 
0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, + 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, + 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, + 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61, + 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, + 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, + 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c, + 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, + 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, + 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb, + 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, + 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, + 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6, + 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, + 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, + 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, +}, +{ // gmult(0x09, b); + 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, + 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, + 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, + 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc, + 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, + 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, + 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a, + 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, + 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, + 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b, + 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, + 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, + 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed, + 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, + 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, + 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46, + +}, +{ // gmult(0x0d, b); + 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, + 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, + 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0, + 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20, + 
0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, + 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, + 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d, + 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, + 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, + 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, + 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, + 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, + 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, + 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, + 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, + 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97, +}, +{ // gmult(0x0b, b); + 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, + 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, + 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, + 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2, + 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, + 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, + 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4, + 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, + 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, + 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e, + 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, + 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, + 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, + 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, + 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, + 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3, +} +}; + +/* + * S-box transformation table + */ +static uint8_t simde_x_aes_s_box[256] = { + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, // 0 + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, // 1 + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, // 2 + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, // 3 + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, // 4 + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, // 5 + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 
0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, // 6 + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, // 7 + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, // 8 + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, // 9 + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, // a + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, // b + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, // c + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, // d + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, // e + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};// f + +/* + * Inverse S-box transformation table + */ +static uint8_t simde_x_aes_inv_s_box[256] = { + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, // 0 + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, // 1 + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, // 2 + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, // 3 + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, // 4 + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, // 5 + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, // 6 + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, // 7 + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, // 8 + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, // 9 + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, // a + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, // b + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, // c + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, // d + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, // e + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};// f + +/* + * Multiplication of 4 byte words + * m(x) = x4+1 + +SIMDE_FUNCTION_ATTRIBUTES +void coef_mult(uint8_t *a, uint8_t *b, uint8_t *d) { + + d[0] = gmult(a[0],b[0])^gmult(a[3],b[1])^gmult(a[2],b[2])^gmult(a[1],b[3]); + d[1] = gmult(a[1],b[0])^gmult(a[0],b[1])^gmult(a[3],b[2])^gmult(a[2],b[3]); + d[2] = gmult(a[2],b[0])^gmult(a[1],b[1])^gmult(a[0],b[2])^gmult(a[3],b[3]); + d[3] = gmult(a[3],b[0])^gmult(a[2],b[1])^gmult(a[1],b[2])^gmult(a[0],b[3]); +} +*/ + +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_coef_mult_lookup(int lookup_table_offset, uint8_t *b, uint8_t *d) { + int o = lookup_table_offset; + + #define gmultl(o,b) simde_x_aes_gmult_lookup_table[o][b] + d[0] = gmultl(o+0,b[0])^gmultl(o+3,b[1])^gmultl(o+2,b[2])^gmultl(o+1,b[3]); + d[1] = 
gmultl(o+1,b[0])^gmultl(o+0,b[1])^gmultl(o+3,b[2])^gmultl(o+2,b[3]); + d[2] = gmultl(o+2,b[0])^gmultl(o+1,b[1])^gmultl(o+0,b[2])^gmultl(o+3,b[3]); + d[3] = gmultl(o+3,b[0])^gmultl(o+2,b[1])^gmultl(o+1,b[2])^gmultl(o+0,b[3]); + #undef gmultl +} + +#endif + +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_AES_H) */ diff --git a/simde-align.h b/simde-align.h index 0c8a809ee..2cd49e75a 100644 --- a/simde-align.h +++ b/simde-align.h @@ -11,7 +11,7 @@ ********************************************************************** * * This is portability layer which should help iron out some - * differences across various compilers, as well as various verisons of + * differences across various compilers, as well as various versions of * C and C++. * * It was originally developed for SIMD Everywhere @@ -55,7 +55,7 @@ #include "hedley.h" /* I know this seems a little silly, but some non-hosted compilers - * don't have stddef.h, so we try to accomodate them. */ + * don't have stddef.h, so we try to accommodate them. */ #if !defined(SIMDE_ALIGN_SIZE_T_) #if defined(__SIZE_TYPE__) #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__ @@ -405,7 +405,7 @@ /* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) * - * Tihs is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a + * This is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a * type instead of a numeric value. */ #if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO) #define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type)) diff --git a/simde-arch.h b/simde-arch.h index 2d09ff772..c9eaa8152 100644 --- a/simde-arch.h +++ b/simde-arch.h @@ -42,6 +42,8 @@ #if !defined(SIMDE_ARCH_H) #define SIMDE_ARCH_H +#include "hedley.h" + /* Alpha */ #if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) @@ -119,9 +121,54 @@ # define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM # endif #endif -#if defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_AES) && __ARM_FEATURE_AES +# define SIMDE_ARCH_ARM_AES +#endif +#if defined(__ARM_FEATURE_COMPLEX) && __ARM_FEATURE_COMPLEX +# define SIMDE_ARCH_ARM_COMPLEX +#endif +#if defined(__ARM_FEATURE_CRYPTO) && __ARM_FEATURE_CRYPTO +# define SIMDE_ARCH_ARM_CRYPTO +#endif +#if defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32 +# define SIMDE_ARCH_ARM_CRC32 +#endif +#if defined(__ARM_FEATURE_DOTPROD) && __ARM_FEATURE_DOTPROD +# define SIMDE_ARCH_ARM_DOTPROD +#endif +#if defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA +# define SIMDE_ARCH_ARM_FMA +#endif +#if defined(__ARM_FEATURE_FP16_FML) && __ARM_FEATURE_FP16_FML +# define SIMDE_ARCH_ARM_FP16_FML +#endif +#if defined(__ARM_FEATURE_FRINT) && __ARM_FEATURE_FRINT +# define SIMDE_ARCH_ARM_FRINT +#endif +#if defined(__ARM_FEATURE_MATMUL_INT8) && __ARM_FEATURE_MATMUL_INT8 +# define SIMDE_ARCH_ARM_MATMUL_INT8 +#endif +#if defined(__ARM_FEATURE_SHA2) && __ARM_FEATURE_SHA2 && !defined(__APPLE_CC__) +# define SIMDE_ARCH_ARM_SHA2 +#endif +#if defined(__ARM_FEATURE_SHA3) && __ARM_FEATURE_SHA3 +# define SIMDE_ARCH_ARM_SHA3 +#endif +#if defined(__ARM_FEATURE_SHA512) && __ARM_FEATURE_SHA512 +# define SIMDE_ARCH_ARM_SHA512 +#endif +#if defined(__ARM_FEATURE_SM3) && __ARM_FEATURE_SM3 +# define SIMDE_ARCH_ARM_SM3 +#endif +#if defined(__ARM_FEATURE_SM4) && __ARM_FEATURE_SM4 +# define SIMDE_ARCH_ARM_SM4 +#endif +#if defined(__ARM_FEATURE_SVE) && __ARM_FEATURE_SVE # define SIMDE_ARCH_ARM_SVE #endif +#if defined(__ARM_FEATURE_QRDMX) && __ARM_FEATURE_QRDMX +# define SIMDE_ARCH_ARM_QRDMX +#endif /* Blackfin */ @@ -267,12 +314,15 @@ # if 
!defined(SIMDE_ARCH_X86_SSE4_1) # define SIMDE_ARCH_X86_SSE4_1 1 # endif -# if !defined(SIMDE_ARCH_X86_SSE4_1) +# if !defined(SIMDE_ARCH_X86_SSE4_2) # define SIMDE_ARCH_X86_SSE4_2 1 # endif # endif # if defined(__AVX2__) # define SIMDE_ARCH_X86_AVX2 1 +# if defined(_MSC_VER) +# define SIMDE_ARCH_X86_FMA 1 +# endif # endif # if defined(__FMA__) # define SIMDE_ARCH_X86_FMA 1 @@ -319,6 +369,9 @@ # if defined(__AVX512VL__) # define SIMDE_ARCH_X86_AVX512VL 1 # endif +# if defined(__AVX512FP16__) +# define SIMDE_ARCH_X86_AVX512FP16 1 +# endif # if defined(__GFNI__) # define SIMDE_ARCH_X86_GFNI 1 # endif @@ -328,9 +381,12 @@ # if defined(__VPCLMULQDQ__) # define SIMDE_ARCH_X86_VPCLMULQDQ 1 # endif -# if defined(__F16C__) +# if defined(__F16C__) || (defined(HEDLEY_MSVC_VERSION) && HEDLEY_MSVC_VERSION_CHECK(19,30,0) && defined(SIMDE_ARCH_X86_AVX2) ) # define SIMDE_ARCH_X86_F16C 1 # endif +# if defined(__AES__) +# define SIMDE_ARCH_X86_AES 1 +# endif #endif /* Itanium @@ -459,6 +515,45 @@ #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0) #endif +/* RISC-V + */ +#if defined(__riscv) || defined(__riscv__) +# if __riscv_xlen == 64 +# define SIMDE_ARCH_RISCV64 +# elif __riscv_xlen == 32 +# define SIMDE_ARCH_RISCV32 +# endif +#endif + +/* RISC-V SIMD ISA extensions */ +#if defined(__riscv_zve32x) +# define SIMDE_ARCH_RISCV_ZVE32X 1 +#endif +#if defined(__riscv_zve32f) +# define SIMDE_ARCH_RISCV_ZVE32F 1 +#endif +#if defined(__riscv_zve64x) +# define SIMDE_ARCH_RISCV_ZVE64X 1 +#endif +#if defined(__riscv_zve64f) +# define SIMDE_ARCH_RISCV_ZVE64F 1 +#endif +#if defined(__riscv_zve64d) +# define SIMDE_ARCH_RISCV_ZVE64D 1 +#endif +#if defined(__riscv_v) || (defined(__riscv_zve64d) && defined(__riscv_zvl128b)) +# define SIMDE_ARCH_RISCV_V 1 +#endif +#if defined(__riscv_zvfh) +# define SIMDE_ARCH_RISCV_ZVFH 1 +#endif +#if defined(__riscv_zvfhmin) +# define SIMDE_ARCH_RISCV_ZVFHMIN 1 +#endif +#if defined(__riscv_zvlsseg) || defined(__riscv_v) +# define SIMDE_ARCH_RISCV_ZVLSSEG 1 +#endif + /* SPARC */ #if defined(__sparc_v9__) || defined(__sparcv9) @@ -557,6 +652,10 @@ # define SIMDE_ARCH_WASM_SIMD128 #endif +#if defined(SIMDE_ARCH_WASM) && defined(__wasm_relaxed_simd__) +# define SIMDE_ARCH_WASM_RELAXED_SIMD +#endif + /* Xtensa */ #if defined(__xtensa__) || defined(__XTENSA__) @@ -568,4 +667,27 @@ # define SIMDE_ARCH_ARM_NEON_FP16 #endif +/* Availability of 16-bit brain floating-point arithmetic intrinsics */ +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) +# define SIMDE_ARCH_ARM_NEON_BF16 +#endif + +/* LoongArch + */ +#if defined(__loongarch32) +# define SIMDE_ARCH_LOONGARCH 1 +#elif defined(__loongarch64) +# define SIMDE_ARCH_LOONGARCH 2 +#endif + +/* LSX: LoongArch 128-bits SIMD extension */ +#if defined(__loongarch_sx) +# define SIMDE_ARCH_LOONGARCH_LSX 1 +#endif + +/* LASX: LoongArch 256-bits SIMD extension */ +#if defined(__loongarch_asx) +# define SIMDE_ARCH_LOONGARCH_LASX 2 +#endif + #endif /* !defined(SIMDE_ARCH_H) */ diff --git a/simde-bf16.h b/simde-bf16.h new file mode 100644 index 000000000..7e0736854 --- /dev/null +++ b/simde-bf16.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#include "hedley.h" +#include "simde-common.h" +#include "simde-detect-clang.h" + +#if !defined(SIMDE_BFLOAT16_H) +#define SIMDE_BFLOAT16_H + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +/* This implementations is based upon simde-f16.h */ + +/* Portable version which should work on pretty much any compiler. + * Obviously you can't rely on compiler support for things like + * conversion to/from 32-bit floats, so make sure you always use the + * functions and macros in this file! + */ +#define SIMDE_BFLOAT16_API_PORTABLE 1 + +#define SIMDE_BFLOAT16_API_BF16 2 + +#if !defined(SIMDE_BFLOAT16_API) + #if defined(SIMDE_ARM_NEON_BF16) + #define SIMDE_BFLOAT16_API SIMDE_BFLOAT16_API_BF16 + #else + #define SIMDE_BFLOAT16_API SIMDE_BFLOAT16_API_PORTABLE + #endif +#endif + +#if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 + #include + typedef __bf16 simde_bfloat16; +#elif SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_PORTABLE + typedef struct { uint16_t value; } simde_bfloat16; +#else + #error No 16-bit floating point API. +#endif + +/* Conversion -- convert between single-precision and brain half-precision + * floats. */ +static HEDLEY_ALWAYS_INLINE HEDLEY_CONST +simde_bfloat16 +simde_bfloat16_from_float32 (simde_float32 value) { +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvth_bf16_f32(value); +#else + simde_bfloat16 res; + char* src = HEDLEY_REINTERPRET_CAST(char*, &value); + // rounding to nearest bfloat16 + // If the 17th bit of value is 1, set the rounding to 1. 
+ uint8_t rounding = 0; + + #if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE + if (src[1] & UINT8_C(0x80)) rounding = 1; + src[2] = HEDLEY_STATIC_CAST(char, (HEDLEY_STATIC_CAST(uint8_t, src[2]) + rounding)); + simde_memcpy(&res, src+2, sizeof(res)); + #else + if (src[2] & UINT8_C(0x80)) rounding = 1; + src[1] = HEDLEY_STATIC_CAST(char, (HEDLEY_STATIC_CAST(uint8_t, src[1]) + rounding)); + simde_memcpy(&res, src, sizeof(res)); + #endif + + return res; +#endif +} + +static HEDLEY_ALWAYS_INLINE HEDLEY_CONST +simde_float32 +simde_bfloat16_to_float32 (simde_bfloat16 value) { +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtah_f32_bf16(value); +#else + simde_float32 res = 0.0; + char* _res = HEDLEY_REINTERPRET_CAST(char*, &res); + + #if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE + simde_memcpy(_res+2, &value, sizeof(value)); + #else + simde_memcpy(_res, &value, sizeof(value)); + #endif + + return res; +#endif +} + +SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_bfloat16, simde_bfloat16, uint16_t) + +#define SIMDE_NANBF simde_uint16_as_bfloat16(0xFFC1) // a quiet Not-a-Number +#define SIMDE_INFINITYBF simde_uint16_as_bfloat16(0x7F80) +#define SIMDE_NINFINITYBF simde_uint16_as_bfloat16(0xFF80) + +#define SIMDE_BFLOAT16_VALUE(value) simde_bfloat16_from_float32(SIMDE_FLOAT32_C(value)) + +#if !defined(simde_isinfbf) && defined(simde_math_isinff) + #define simde_isinfbf(a) simde_math_isinff(simde_bfloat16_to_float32(a)) +#endif +#if !defined(simde_isnanbf) && defined(simde_math_isnanf) + #define simde_isnanbf(a) simde_math_isnanf(simde_bfloat16_to_float32(a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_BFLOAT16_H) */ diff --git a/simde-common.h b/simde-common.h index 9c333e998..4a16200c1 100644 --- a/simde-common.h +++ b/simde-common.h @@ -22,6 +22,8 @@ * * Copyright: * 2017-2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_COMMON_H) @@ -30,8 +32,8 @@ #include "hedley.h" #define SIMDE_VERSION_MAJOR 0 -#define SIMDE_VERSION_MINOR 7 -#define SIMDE_VERSION_MICRO 3 +#define SIMDE_VERSION_MINOR 8 +#define SIMDE_VERSION_MICRO 4 #define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO) // Also update meson.build in the root directory of the repository @@ -562,6 +564,61 @@ typedef SIMDE_FLOAT32_TYPE simde_float32; #endif typedef SIMDE_FLOAT64_TYPE simde_float64; +#if defined(SIMDE_POLY8_TYPE) +# undef SIMDE_POLY8_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +# define SIMDE_POLY8_TYPE poly8_t +# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(poly8_t, value)) +#else +# define SIMDE_POLY8_TYPE uint8_t +# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(uint8_t, value)) +#endif +typedef SIMDE_POLY8_TYPE simde_poly8; + +#if defined(SIMDE_POLY16_TYPE) +# undef SIMDE_POLY16_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +# define SIMDE_POLY16_TYPE poly16_t +# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(poly16_t, value)) +#else +# define SIMDE_POLY16_TYPE uint16_t +# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(uint16_t, value)) +#endif +typedef SIMDE_POLY16_TYPE simde_poly16; + +#if defined(SIMDE_POLY64_TYPE) +# undef SIMDE_POLY64_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) +# define SIMDE_POLY64_TYPE poly64_t +# define SIMDE_POLY64_C(value) (HEDLEY_STATIC_CAST(poly64_t, value ## ull)) +#else +# define SIMDE_POLY64_TYPE uint64_t +# define SIMDE_POLY64_C(value) 
value ## ull +#endif +typedef SIMDE_POLY64_TYPE simde_poly64; + +#if defined(SIMDE_POLY128_TYPE) +# undef SIMDE_POLY128_TYPE +#endif +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ // due to the __int128 below +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) +# define SIMDE_POLY128_TYPE poly128_t +# define SIMDE_POLY128_C(value) value +#elif defined(__SIZEOF_INT128__) +# define SIMDE_POLY128_TYPE __int128 +# define SIMDE_POLY128_C(value) (HEDLEY_STATIC_CAST(__int128, value)) +#else +# define SIMDE_POLY128_TYPE uint64_t +# define SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE 1 +#endif + +typedef SIMDE_POLY128_TYPE simde_poly128; +HEDLEY_DIAGNOSTIC_POP + #if defined(__cplusplus) typedef bool simde_bool; #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) @@ -712,6 +769,36 @@ typedef SIMDE_FLOAT64_TYPE simde_float64; #endif #endif +/*** Functions that quiet a signaling NaN ***/ + +static HEDLEY_INLINE +double +simde_math_quiet(double x) { + uint64_t tmp, mask; + if (!simde_math_isnan(x)) { + return x; + } + simde_memcpy(&tmp, &x, 8); + mask = 0x7ff80000; + mask <<= 32; + tmp |= mask; + simde_memcpy(&x, &tmp, 8); + return x; +} + +static HEDLEY_INLINE +float +simde_math_quietf(float x) { + uint32_t tmp; + if (!simde_math_isnanf(x)) { + return x; + } + simde_memcpy(&tmp, &x, 4); + tmp |= 0x7fc00000lu; + simde_memcpy(&x, &tmp, 4); + return x; +} + #if defined(FE_ALL_EXCEPT) #define SIMDE_HAVE_FENV_H #elif defined(__has_include) @@ -826,6 +913,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ #define SIMDE_BUILTIN_TYPE_64_ long long #endif +/* SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ */ +HEDLEY_DIAGNOSTIC_POP + #if defined(SIMDE_BUILTIN_SUFFIX_8_) #define SIMDE_BUILTIN_8_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_) #define SIMDE_BUILTIN_HAS_8_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)) @@ -899,6 +989,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(4,6,0) # define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ # endif +# if !HEDLEY_GCC_VERSION_CHECK(7,4,0) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,3,0)) +# define SIMDE_BUG_GCC_87467 +# endif # if !HEDLEY_GCC_VERSION_CHECK(8,0,0) # define SIMDE_BUG_GCC_REV_247851 # endif @@ -913,7 +1006,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_BAD_VEXT_REV32 # endif -# if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) +# if !(HEDLEY_GCC_VERSION_CHECK(9,4,0) \ + || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && !HEDLEY_GCC_VERSION_CHECK(9,0,0)) \ + ) && defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) # define SIMDE_BUG_GCC_94482 # endif # if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || defined(SIMDE_ARCH_ZARCH) @@ -926,10 +1021,12 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(11,2,0) # define SIMDE_BUG_GCC_95483 # endif -# if defined(__OPTIMIZE__) +# if defined(__OPTIMIZE__) && !HEDLEY_GCC_VERSION_CHECK(15,0,0) # define SIMDE_BUG_GCC_100927 # endif -# define SIMDE_BUG_GCC_98521 +# if !(HEDLEY_GCC_VERSION_CHECK(10,3,0)) +# define SIMDE_BUG_GCC_98521 +# endif # endif # if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_94488 @@ -937,18 +1034,35 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(9,1,0) && defined(SIMDE_ARCH_AARCH64) # define 
SIMDE_BUG_GCC_REV_264019 # endif -# if defined(SIMDE_ARCH_ARM) +# if (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && !defined(SIMDE_ARCH_AARCH64)) || (!defined(SIMDE_ARCH_AARCH64) && defined(SIMDE_ARCH_ARM)) +# define SIMDE_BUG_GCC_REV_260989 +# endif +# if !HEDLEY_GCC_VERSION_CHECK(11,5,0) && (defined(SIMDE_ARCH_ARM4) || defined(SIMDE_ARCH_AARCH64)) +# define SIMDE_BUG_GCC_114521 +# endif +# if defined(SIMDE_ARCH_ARM) && !defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_95399 # define SIMDE_BUG_GCC_95471 -# elif defined(SIMDE_ARCH_POWER) +# define SIMDE_BUG_GCC_111609 +# if SIMDE_ARCH_ARM_CHECK(8,0) +# define SIMDE_BUG_GCC_113065 +# endif +# endif +# if defined(SIMDE_ARCH_POWER) # define SIMDE_BUG_GCC_95227 -# define SIMDE_BUG_GCC_95782 -# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS -# elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) +# if !HEDLEY_GCC_VERSION_CHECK(13,0,0) +# define SIMDE_BUG_GCC_95782 +# endif +# if !HEDLEY_GCC_VERSION_CHECK(12,0,0) +# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS +# endif +# endif +# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) # if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && !defined(__OPTIMIZE__) # define SIMDE_BUG_GCC_96174 # endif -# elif defined(SIMDE_ARCH_ZARCH) +# endif +# if defined(SIMDE_ARCH_ZARCH) # define SIMDE_BUG_GCC_95782 # if HEDLEY_GCC_VERSION_CHECK(10,0,0) # define SIMDE_BUG_GCC_101614 @@ -956,21 +1070,30 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # endif # if defined(SIMDE_ARCH_MIPS_MSA) # define SIMDE_BUG_GCC_97248 -# define SIMDE_BUG_GCC_100760 -# define SIMDE_BUG_GCC_100761 -# define SIMDE_BUG_GCC_100762 +# if !HEDLEY_GCC_VERSION_CHECK(12,1,0) +# define SIMDE_BUG_GCC_100760 +# define SIMDE_BUG_GCC_100761 +# define SIMDE_BUG_GCC_100762 +# endif # endif -# define SIMDE_BUG_GCC_95399 -# if !defined(__OPTIMIZE__) +# if !defined(__OPTIMIZE__) && !(\ + HEDLEY_GCC_VERSION_CHECK(11,4,0) \ + || (HEDLEY_GCC_VERSION_CHECK(10,4,0) && !(HEDLEY_GCC_VERSION_CHECK(11,0,0))) \ + || (HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0)))) # define SIMDE_BUG_GCC_105339 # endif # elif defined(__clang__) # if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_CLANG_45541 -# define SIMDE_BUG_CLANG_46844 -# define SIMDE_BUG_CLANG_48257 +# define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601 +# define SIMDE_BUG_CLANG_71362 // https://github.com/llvm/llvm-project/issues/71362 +# define SIMDE_BUG_CLANG_71365 // https://github.com/llvm/llvm-project/issues/71365 +# define SIMDE_BUG_CLANG_71751 // https://github.com/llvm/llvm-project/issues/71751 +# if !SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0) +# define SIMDE_BUG_CLANG_45541 +# endif # if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) # define SIMDE_BUG_CLANG_46840 +# define SIMDE_BUG_CLANG_46844 # endif # if SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) # define SIMDE_BUG_CLANG_BAD_VI64_OPS @@ -984,19 +1107,26 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) # define SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES # endif +# if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) +# define SIMDE_BUG_CLANG_71763 // https://github.com/llvm/llvm-project/issues/71763 +# endif # endif # if defined(SIMDE_ARCH_POWER) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) # define SIMDE_BUG_CLANG_46770 # endif # if defined(SIMDE_ARCH_POWER) && (SIMDE_ARCH_POWER == 700) && (SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)) -# define SIMDE_BUG_CLANG_50893 -# define SIMDE_BUG_CLANG_50901 +# if 
!SIMDE_DETECT_CLANG_VERSION_CHECK(13,0,0) +# define SIMDE_BUG_CLANG_50893 +# define SIMDE_BUG_CLANG_50901 +# endif # endif # if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) && !defined(__OPTIMIZE__) # define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT # endif # if defined(SIMDE_ARCH_POWER) -# define SIMDE_BUG_CLANG_50932 +# if !SIMDE_DETECT_CLANG_VERSION_CHECK(14,0,0) +# define SIMDE_BUG_CLANG_50932 +# endif # if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) # define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS # endif @@ -1023,9 +1153,12 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) # define SIMDE_BUG_CLANG_44589 # endif -# define SIMDE_BUG_CLANG_48673 +# define SIMDE_BUG_CLANG_48673 // https://github.com/llvm/llvm-project/issues/48017 +# endif +# define SIMDE_BUG_CLANG_45959 // https://github.com/llvm/llvm-project/issues/45304 +# if defined(SIMDE_ARCH_WASM_SIMD128) && !SIMDE_DETECT_CLANG_VERSION_CHECK(17,0,0) +# define SIMDE_BUG_CLANG_60655 # endif -# define SIMDE_BUG_CLANG_45959 # elif defined(HEDLEY_MSVC_VERSION) # if defined(SIMDE_ARCH_X86) # define SIMDE_BUG_MSVC_ROUND_EXTRACT @@ -1053,10 +1186,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ HEDLEY_GCC_VERSION_CHECK(4,3,0) # define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (__extension__ ({ \ HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_POP \ _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") \ __typeof__(expr) simde_bug_ignore_sign_conversion_v_= (expr); \ - HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_POP \ simde_bug_ignore_sign_conversion_v_; \ })) #else @@ -1073,6 +1205,34 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(int##width##_t, (value)) #endif +/* Initial support for RISCV V extensions based on ZVE64D. 
*/ +#if defined(SIMDE_ARCH_RISCV_ZVE64D) && SIMDE_NATURAL_VECTOR_SIZE >= 64 && defined(__riscv_v_fixed_vlen) + #define RVV_FIXED_TYPE_DEF(name, lmul) \ + typedef vint8##name##_t fixed_vint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vint16##name##_t fixed_vint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vint32##name##_t fixed_vint32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint8##name##_t fixed_vuint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint16##name##_t fixed_vuint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint32##name##_t fixed_vuint32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vfloat32##name##_t fixed_vfloat32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF(mf2, 1/2); + RVV_FIXED_TYPE_DEF(m1, 1); + RVV_FIXED_TYPE_DEF(m2, 2); + #define RVV_FIXED_TYPE_DEF_64B(name, lmul) \ + typedef vint64##name##_t fixed_vint64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint64##name##_t fixed_vuint64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vfloat64##name##_t fixed_vfloat64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF_64B(m1, 1); + RVV_FIXED_TYPE_DEF_64B(m2, 2); + #if defined(SIMDE_ARCH_RISCV_ZVFH) + #define RVV_FIXED_TYPE_DEF_16F(name, lmul) \ + typedef vfloat16##name##_t fixed_vfloat16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF_16F(mf2, 1/2); + RVV_FIXED_TYPE_DEF_16F(m1, 1); + RVV_FIXED_TYPE_DEF_16F(m2, 2); + #endif +#endif + /* SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ */ HEDLEY_DIAGNOSTIC_POP diff --git a/simde-complex.h b/simde-complex.h index ce840e228..48ebe4cf1 100644 --- a/simde-complex.h +++ b/simde-complex.h @@ -26,7 +26,7 @@ /* Support for complex math. * - * We try to avoid inculding (in C++ mode) since it pulls in + * We try to avoid including (in C++ mode) since it pulls in * a *lot* of code. Unfortunately this only works for GNU modes (i.e., * -std=gnu++14 not -std=c++14) unless you pass -fext-numeric-literals, * but there is no way (AFAICT) to detect that flag so we have to rely diff --git a/simde-detect-clang.h b/simde-detect-clang.h index b28107459..7326f02db 100644 --- a/simde-detect-clang.h +++ b/simde-detect-clang.h @@ -54,10 +54,27 @@ * need more resolution I'm happy to accept patches that are able to * detect minor versions as well. That said, you'll probably have a * hard time with detection since AFAIK most minor releases don't add - * anything we can detect. */ + * anything we can detect. Updated based on + * https://github.com/google/highway/blob/438c705a295176b96a50336527bb3e7ea365ffac/hwy/detect_compiler_arch.h#L73 + * - would welcome patches/updates there as well. 
+ */ #if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) -# if __has_warning("-Wformat-insufficient-args") +# if __has_warning("-Wmissing-designated-field-initializers") +# define SIMDE_DETECT_CLANG_VERSION 190000 +# elif __has_warning("-Woverriding-option") +# define SIMDE_DETECT_CLANG_VERSION 180000 +# elif __has_attribute(unsafe_buffer_usage) // no new warnings in 17.0 +# define SIMDE_DETECT_CLANG_VERSION 170000 +# elif __has_attribute(nouwtable) // no new warnings in 16.0 +# define SIMDE_DETECT_CLANG_VERSION 160000 +# elif __has_warning("-Warray-parameter") +# define SIMDE_DETECT_CLANG_VERSION 150000 +# elif __has_warning("-Wbitwise-instead-of-logical") +# define SIMDE_DETECT_CLANG_VERSION 140000 +# elif __has_warning("-Waix-compat") +# define SIMDE_DETECT_CLANG_VERSION 130000 +# elif __has_warning("-Wformat-insufficient-args") # define SIMDE_DETECT_CLANG_VERSION 120000 # elif __has_warning("-Wimplicit-const-int-float-conversion") # define SIMDE_DETECT_CLANG_VERSION 110000 @@ -67,7 +84,12 @@ # define SIMDE_DETECT_CLANG_VERSION 90000 # elif __has_warning("-Wextra-semi-stmt") || __has_builtin(__builtin_rotateleft32) # define SIMDE_DETECT_CLANG_VERSION 80000 -# elif __has_warning("-Wc++98-compat-extra-semi") +// For reasons unknown, Xcode 10.3 (Apple LLVM version 10.0.1) is apparently +// based on Clang 7, but does not support the warning we test. +// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and +// https://trac.macports.org/wiki/XcodeVersionInfo. +# elif __has_warning("-Wc++98-compat-extra-semi") || \ + (defined(__apple_build_version__) && __apple_build_version__ >= 10010000) # define SIMDE_DETECT_CLANG_VERSION 70000 # elif __has_warning("-Wpragma-pack") # define SIMDE_DETECT_CLANG_VERSION 60000 diff --git a/simde-diagnostic.h b/simde-diagnostic.h index ff18172c6..6c7d2e732 100644 --- a/simde-diagnostic.h +++ b/simde-diagnostic.h @@ -272,7 +272,7 @@ #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ #endif -/* clang will emit this warning when we use C99 extensions whan not in +/* clang will emit this warning when we use C99 extensions when not in * C99 mode, even though it does support this. In such cases we check * the compiler and version first, so we know it's not a problem. */ #if HEDLEY_HAS_WARNING("-Wc99-extensions") @@ -400,6 +400,8 @@ * more elegantly, but until then... */ #if defined(HEDLEY_MSVC_VERSION) #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable:4702)) +#elif defined(__clang__) + #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ HEDLEY_PRAGMA(clang diagnostic ignored "-Wunreachable-code") #else #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ #endif diff --git a/simde-f16.h b/simde-f16.h index be5ebeacc..93afaf56f 100644 --- a/simde-f16.h +++ b/simde-f16.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #include "hedley.h" @@ -57,7 +58,7 @@ SIMDE_BEGIN_DECLS_ * that on Arm since it would break compatibility with the NEON F16 * functions. */ #define SIMDE_FLOAT16_API_FP16_NO_ABI 3 -/* This is basically __fp16 as specified by Arm, where arugments and +/* This is basically __fp16 as specified by Arm, where arguments and * return values are raw __fp16 values not structs. */ #define SIMDE_FLOAT16_API_FP16 4 @@ -65,16 +66,31 @@ SIMDE_BEGIN_DECLS_ * any ideas on how to improve it. If you do, patches are definitely * welcome. */ #if !defined(SIMDE_FLOAT16_API) - #if 0 && !defined(__cplusplus) - /* I haven't found a way to detect this. 
It seems like defining + #if defined(__ARM_FP16_FORMAT_IEEE) && (defined(SIMDE_ARM_NEON_FP16) || defined(__ARM_FP16_ARGS)) + #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 + #elif !defined(__EMSCRIPTEN__) && !(defined(__clang__) && defined(SIMDE_ARCH_POWER)) && \ + !(defined(HEDLEY_MSVC_VERSION) && defined(__clang__)) && \ + !(defined(SIMDE_ARCH_MIPS) && defined(__clang__)) && \ + !(defined(SIMDE_ARCH_ZARCH) && defined(__clang__)) && \ + !(defined(SIMDE_ARCH_LOONGARCH) && defined(__clang__)) && \ + !(defined(__clang__) && defined(SIMDE_ARCH_RISCV64)) && ( \ + defined(SIMDE_X86_AVX512FP16_NATIVE) || \ + (defined(SIMDE_ARCH_X86_SSE2) && HEDLEY_GCC_VERSION_CHECK(12,0,0)) || \ + (defined(SIMDE_ARCH_AARCH64) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !defined(__cplusplus)) || \ + (defined(SIMDE_ARCH_AARCH64) && HEDLEY_GCC_VERSION_CHECK(13,0,0)) || \ + ((defined(SIMDE_ARCH_X86_SSE2) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0)) || \ + (!(defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(6,0,0))) || \ + defined(SIMDE_ARCH_RISCV_ZVFH) + /* We haven't found a better way to detect this. It seems like defining * __STDC_WANT_IEC_60559_TYPES_EXT__, then including float.h, then * checking for defined(FLT16_MAX) should work, but both gcc and * clang will define the constants even if _Float16 is not * supported. Ideas welcome. */ #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FLOAT16 - #elif defined(__ARM_FP16_FORMAT_IEEE) && defined(SIMDE_ARM_NEON_FP16) - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 - #elif defined(__FLT16_MIN__) && (defined(__clang__) && (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))) + #elif defined(__FLT16_MIN__) && \ + (defined(__clang__) && \ + (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) \ + && !defined(SIMDE_ARCH_RISCV64) && !defined(SIMDE_ARCH_LOONGARCH)) #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16_NO_ABI #else #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_PORTABLE @@ -82,17 +98,27 @@ SIMDE_BEGIN_DECLS_ #endif #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16 + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ // due to the _Float16 below typedef _Float16 simde_float16; - #define SIMDE_FLOAT16_C(value) value##f16 + HEDLEY_DIAGNOSTIC_POP + #define SIMDE_FLOAT16_IS_SCALAR 1 + #if !defined(__cplusplus) + #define SIMDE_FLOAT16_C(value) value##f16 + #else + #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(_Float16, (value)) + #endif #elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI typedef struct { __fp16 value; } simde_float16; - #if defined(SIMDE_STATEMENT_EXPR_) + #if defined(SIMDE_STATEMENT_EXPR_) && !defined(SIMDE_TESTS_H) #define SIMDE_FLOAT16_C(value) (__extension__({ ((simde_float16) { HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ HEDLEY_STATIC_CAST(__fp16, (value)) }); HEDLEY_DIAGNOSTIC_POP })) #else #define SIMDE_FLOAT16_C(value) ((simde_float16) { HEDLEY_STATIC_CAST(__fp16, (value)) }) + #define SIMDE_FLOAT16_IS_SCALAR 1 #endif #elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 typedef __fp16 simde_float16; + #define SIMDE_FLOAT16_IS_SCALAR 1 #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(__fp16, (value)) #elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE typedef struct { uint16_t value; } simde_float16; @@ -114,12 +140,42 @@ SIMDE_BEGIN_DECLS_ SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float16_as_uint16, uint16_t, simde_float16) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_float16, simde_float16, uint16_t) 
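/*
 * Editorial sketch, not part of the upstream patch: the SIMDE_NANHF /
 * SIMDE_INFINITYHF / SIMDE_NINFINITYHF constants in the following hunk rely
 * on the IEEE 754 binary16 layout (1 sign bit, 5 exponent bits, 10 mantissa
 * bits). This standalone helper decodes those bit patterns the same way, so
 * you can see why 0x7C00 is +infinity, 0xFC00 is -infinity and 0x7E00 is a
 * quiet NaN. The name decode_binary16() is illustrative only and does not
 * exist in SIMDe.
 */
#include <stdint.h>
#include <stdio.h>

static void decode_binary16(uint16_t bits) {
  unsigned sign     = (bits >> 15) & 0x1;   /* bit  15     */
  unsigned exponent = (bits >> 10) & 0x1F;  /* bits 14..10 */
  unsigned mantissa =  bits        & 0x3FF; /* bits  9..0  */

  const char *kind;
  if (exponent == 0x1F)
    kind = (mantissa == 0) ? "infinity"
         : ((mantissa & 0x200) ? "quiet NaN" : "signaling NaN");
  else if (exponent == 0)
    kind = (mantissa == 0) ? "zero" : "subnormal";
  else
    kind = "normal";

  printf("0x%04X -> sign=%u exp=%02u mant=0x%03X (%s%s)\n",
         bits, sign, exponent, mantissa, sign ? "-" : "+", kind);
}

int main(void) {
  decode_binary16(0x7C00); /* +infinity (SIMDE_INFINITYHF)  */
  decode_binary16(0xFC00); /* -infinity (SIMDE_NINFINITYHF) */
  decode_binary16(0x7E00); /* quiet NaN (SIMDE_NANHF)       */
  return 0;
}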
-#define SIMDE_NANHF simde_uint16_as_float16(0x7E00) -#define SIMDE_INFINITYHF simde_uint16_as_float16(0x7C00) +#if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE + #define SIMDE_NANHF simde_uint16_as_float16(0x7E00) // a quiet Not-a-Number + #define SIMDE_INFINITYHF simde_uint16_as_float16(0x7C00) + #define SIMDE_NINFINITYHF simde_uint16_as_float16(0xFC00) +#else + #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI + #if SIMDE_MATH_BUILTIN_LIBM(nanf16) + #define SIMDE_NANHF SIMDE_FLOAT16_C(__builtin_nanf16("")) + #elif defined(SIMDE_MATH_NAN) + #define SIMDE_NANHF SIMDE_FLOAT16_C(SIMDE_MATH_NAN) + #endif + #if SIMDE_MATH_BUILTIN_LIBM(inf16) + #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(__builtin_inf16()) + #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-__builtin_inf16()) + #else + #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(SIMDE_MATH_INFINITY) + #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-SIMDE_MATH_INFINITY) + #endif + #else + #if SIMDE_MATH_BUILTIN_LIBM(nanf16) + #define SIMDE_NANHF __builtin_nanf16("") + #elif defined(SIMDE_MATH_NAN) + #define SIMDE_NANHF SIMDE_MATH_NAN + #endif + #if SIMDE_MATH_BUILTIN_LIBM(inf16) + #define SIMDE_INFINITYHF __builtin_inf16() + #define SIMDE_NINFINITYHF -(__builtin_inf16()) + #else + #define SIMDE_INFINITYHF HEDLEY_STATIC_CAST(simde_float16, SIMDE_MATH_INFINITY) + #define SIMDE_NINFINITYHF HEDLEY_STATIC_CAST(simde_float16, -SIMDE_MATH_INFINITY) + #endif + #endif +#endif /* Conversion -- convert between single-precision and half-precision * floats. */ - static HEDLEY_ALWAYS_INLINE HEDLEY_CONST simde_float16 simde_float16_from_float32 (simde_float32 value) { @@ -217,6 +273,54 @@ simde_float16_to_float32 (simde_float16 value) { #define SIMDE_FLOAT16_VALUE(value) simde_float16_from_float32(SIMDE_FLOAT32_C(value)) #endif +#if !defined(simde_isinfhf) && defined(simde_math_isinff) + #define simde_isinfhf(a) simde_math_isinff(simde_float16_to_float32(a)) +#endif +#if !defined(simde_isnanhf) && defined(simde_math_isnanf) + #define simde_isnanhf(a) simde_math_isnanf(simde_float16_to_float32(a)) +#endif +#if !defined(simde_isnormalhf) && defined(simde_math_isnormalf) + #define simde_isnormalhf(a) simde_math_isnormalf(simde_float16_to_float32(a)) +#endif +#if !defined(simde_issubnormalhf) && defined(simde_math_issubnormalf) + #define simde_issubnormalhf(a) simde_math_issubnormalf(simde_float16_to_float32(a)) +#endif + +#define simde_fpclassifyhf(a) simde_math_fpclassifyf(simde_float16_to_float32(a)) + +static HEDLEY_INLINE +uint8_t +simde_fpclasshf(simde_float16 v, const int imm8) { + uint16_t bits = simde_float16_as_uint16(v); + uint8_t negative = (bits >> 15) & 1; + uint16_t const ExpMask = 0x7C00; // [14:10] + uint16_t const MantMask = 0x03FF; // [9:0] + uint8_t exponent_all_ones = ((bits & ExpMask) == ExpMask); + uint8_t exponent_all_zeros = ((bits & ExpMask) == 0); + uint8_t mantissa_all_zeros = ((bits & MantMask) == 0); + uint8_t zero = exponent_all_zeros & mantissa_all_zeros; + uint8_t signaling_bit = (bits >> 9) & 1; + + uint8_t result = 0; + uint8_t snan = exponent_all_ones & (!mantissa_all_zeros) & (!signaling_bit); + uint8_t qnan = exponent_all_ones & (!mantissa_all_zeros) & signaling_bit; + uint8_t positive_zero = (!negative) & zero; + uint8_t negative_zero = negative & zero; + uint8_t positive_infinity = (!negative) & exponent_all_ones & mantissa_all_zeros; + uint8_t negative_infinity = negative & exponent_all_ones & mantissa_all_zeros; + uint8_t denormal = exponent_all_zeros & (!mantissa_all_zeros); + uint8_t finite_negative = negative & 
(!exponent_all_ones) & (!zero); + result = (((imm8 >> 0) & qnan) | \ + ((imm8 >> 1) & positive_zero) | \ + ((imm8 >> 2) & negative_zero) | \ + ((imm8 >> 3) & positive_infinity) | \ + ((imm8 >> 4) & negative_infinity) | \ + ((imm8 >> 5) & denormal) | \ + ((imm8 >> 6) & finite_negative) | \ + ((imm8 >> 7) & snan)); + return result; +} + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/simde-features.h b/simde-features.h index b34c52d8d..60d940981 100644 --- a/simde-features.h +++ b/simde-features.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* simde-arch.h is used to determine which features are available according @@ -39,9 +40,6 @@ #define SIMDE_X86_SVML_NATIVE #endif #endif -#if defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT) @@ -142,6 +140,15 @@ #define SIMDE_X86_AVX512F_NATIVE #endif +#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && !defined(SIMDE_X86_AVX512FP16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_X86_AVX512FP16) + #define SIMDE_X86_AVX512FP16_NATIVE + #endif +#endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) + #define SIMDE_X86_AVX512F_NATIVE +#endif + #if !defined(SIMDE_X86_AVX512BF16_NATIVE) && !defined(SIMDE_X86_AVX512BF16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_X86_AVX512BF16) #define SIMDE_X86_AVX512BF16_NATIVE @@ -183,7 +190,7 @@ #define SIMDE_X86_AVX_NATIVE #endif #endif -#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE) #define SIMDE_X86_SSE4_2_NATIVE #endif @@ -232,6 +239,15 @@ #define SIMDE_X86_SSE2_NATIVE #endif +#if !defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_AES_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_X86_AES) + #define SIMDE_X86_AES_NATIVE + #endif +#endif +#if defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) + #define SIMDE_X86_SSE2_NATIVE +#endif + #if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_X86_SSE2) #define SIMDE_X86_SSE2_NATIVE @@ -278,7 +294,7 @@ #endif #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(__INTEL_COMPILER) + #if defined(SIMDE_ARCH_X86) && (defined(__INTEL_COMPILER) || (HEDLEY_MSVC_VERSION_CHECK(14, 20, 0) && !defined(__clang__))) #define SIMDE_X86_SVML_NATIVE #endif #endif @@ -289,7 +305,7 @@ #endif #if \ - defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) + defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) #include #elif defined(SIMDE_X86_SSE4_2_NATIVE) #include @@ -315,6 +331,10 @@ #endif #endif +#if defined(SIMDE_X86_AES_NATIVE) + #include +#endif + #if defined(HEDLEY_MSVC_VERSION) #pragma warning(pop) #endif @@ -329,10 +349,13 @@ #endif #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(8,0) && (__ARM_NEON_FP & 0x02) + #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(8,0) && defined (__ARM_NEON_FP) && (__ARM_NEON_FP & 0x02) #define SIMDE_ARM_NEON_A32V8_NATIVE #endif #endif +#if 
defined(__ARM_ACLE) + #include +#endif #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define SIMDE_ARM_NEON_A32V7_NATIVE #endif @@ -356,12 +379,27 @@ #endif #endif +#if !defined(SIMDE_RISCV_V_NATIVE) && !defined(SIMDE_RISCV_V_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_RISCV_V) && defined(__riscv_v_fixed_vlen) + #define SIMDE_RISCV_V_NATIVE + #endif +#endif +#if defined(SIMDE_RISCV_V_NATIVE) + #include +#endif + #if !defined(SIMDE_WASM_SIMD128_NATIVE) && !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_WASM_SIMD128) #define SIMDE_WASM_SIMD128_NATIVE #endif #endif -#if defined(SIMDE_WASM_SIMD128_NATIVE) + +#if !defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) && !defined(SIMDE_WASM_RELAXED_SIMD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_WASM_RELAXED_SIMD) + #define SIMDE_WASM_RELAXED_SIMD_NATIVE + #endif +#endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) #include #endif @@ -515,12 +553,16 @@ defined(SIMDE_WASM_SIMD128_NATIVE) || \ defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) || \ defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || \ - defined(SIMDE_MIPS_MSA_NATIVE) + defined(SIMDE_MIPS_MSA_NATIVE) || \ + defined(SIMDE_LOONGARCH_LSX_NATIVE) #define SIMDE_NATURAL_VECTOR_SIZE (128) #elif defined(SIMDE_X86_SSE_NATIVE) #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (128) #define SIMDE_NATURAL_INT_VECTOR_SIZE (64) #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (0) + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(__riscv_v_fixed_vlen) + //FIXME : SIMDE_NATURAL_VECTOR_SIZE == __riscv_v_fixed_vlen + #define SIMDE_NATURAL_VECTOR_SIZE (128) #endif #if !defined(SIMDE_NATURAL_VECTOR_SIZE) @@ -617,12 +659,18 @@ #if !defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) #define SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) + #define SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_X86_AVX512DQ_NATIVE) #define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES #endif #if !defined(SIMDE_X86_AVX512CD_NATIVE) #define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_AVX512FP16_NATIVE) + #define SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_X86_GFNI_NATIVE) #define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES #endif @@ -635,6 +683,12 @@ #if !defined(SIMDE_X86_F16C_NATIVE) #define SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_AES_NATIVE) + #define SIMDE_X86_AES_ENABLE_NATIVE_ALIASES + #endif + #if !defined(SIMDE_X86_SVML_NATIVE) + #define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES @@ -650,6 +704,14 @@ #define SIMDE_ARM_SVE_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_RISCV_V_NATIVE) + #define SIMDE_RISCV_V_ENABLE_NATIVE_ALIASES + #endif + + #if !defined(SIMDE_MIPS_MSA_NATIVE) + #define SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES + #endif + #if !defined(SIMDE_WASM_SIMD128_NATIVE) #define SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES #endif @@ -682,4 +744,27 @@ #define SIMDE_ARM_NEON_FP16 #endif +#if defined(SIMDE_ARCH_ARM_NEON_BF16) + #define SIMDE_ARM_NEON_BF16 +#endif + +#if !defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_LOONGARCH_LASX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_LOONGARCH_LASX) + #define SIMDE_LOONGARCH_LASX_NATIVE + #endif +#endif + +#if !defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_LOONGARCH_LSX_NO_NATIVE) && 
!defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_LOONGARCH_LSX) + #define SIMDE_LOONGARCH_LSX_NATIVE + #endif +#endif + +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + #include +#endif +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + #include +#endif + #endif /* !defined(SIMDE_FEATURES_H) */ diff --git a/simde-math.h b/simde-math.h index 7e15a1c04..1dbf3bbae 100644 --- a/simde-math.h +++ b/simde-math.h @@ -22,6 +22,7 @@ * * Copyright: * 2017-2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* Attempt to find math functions. Functions may be in , @@ -174,7 +175,7 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #endif #endif -#if !defined(SIMDE_NANF) +#if !defined(SIMDE_MATH_NANF) #if \ HEDLEY_HAS_BUILTIN(__builtin_nanf) || \ HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ @@ -434,6 +435,91 @@ simde_math_fpclassify(double v) { #endif } +#define SIMDE_MATH_FP_QNAN 0x01 +#define SIMDE_MATH_FP_PZERO 0x02 +#define SIMDE_MATH_FP_NZERO 0x04 +#define SIMDE_MATH_FP_PINF 0x08 +#define SIMDE_MATH_FP_NINF 0x10 +#define SIMDE_MATH_FP_DENORMAL 0x20 +#define SIMDE_MATH_FP_NEGATIVE 0x40 +#define SIMDE_MATH_FP_SNAN 0x80 + +static HEDLEY_INLINE +uint8_t +simde_math_fpclassf(float v, const int imm8) { + union { + float f; + uint32_t u; + } fu; + fu.f = v; + uint32_t bits = fu.u; + uint8_t NegNum = (bits >> 31) & 1; + uint32_t const ExpMask = 0x3F800000; // [30:23] + uint32_t const MantMask = 0x007FFFFF; // [22:0] + uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); + uint8_t ExpAllZeros = ((bits & ExpMask) == 0); + uint8_t MantAllZeros = ((bits & MantMask) == 0); + uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; + uint8_t SignalingBit = (bits >> 22) & 1; + + uint8_t result = 0; + uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; + uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; + uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; + uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; + uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; + uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); + uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); + uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); + result = (((imm8 >> 0) & qNaN_res) | \ + ((imm8 >> 1) & Pzero_res) | \ + ((imm8 >> 2) & Nzero_res) | \ + ((imm8 >> 3) & Pinf_res) | \ + ((imm8 >> 4) & Ninf_res) | \ + ((imm8 >> 5) & Denorm_res) | \ + ((imm8 >> 6) & FinNeg_res) | \ + ((imm8 >> 7) & sNaN_res)); + return result; +} + +static HEDLEY_INLINE +uint8_t +simde_math_fpclass(double v, const int imm8) { + union { + double d; + uint64_t u; + } du; + du.d = v; + uint64_t bits = du.u; + uint8_t NegNum = (bits >> 63) & 1; + uint64_t const ExpMask = 0x3FF0000000000000; // [62:52] + uint64_t const MantMask = 0x000FFFFFFFFFFFFF; // [51:0] + uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); + uint8_t ExpAllZeros = ((bits & ExpMask) == 0); + uint8_t MantAllZeros = ((bits & MantMask) == 0); + uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; + uint8_t SignalingBit = (bits >> 51) & 1; + + uint8_t result = 0; + uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; + uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; + uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; + uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; + uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; + uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); + uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); + uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); + result = 
(((imm8 >> 0) & qNaN_res) | \ + ((imm8 >> 1) & Pzero_res) | \ + ((imm8 >> 2) & Nzero_res) | \ + ((imm8 >> 3) & Pinf_res) | \ + ((imm8 >> 4) & Ninf_res) | \ + ((imm8 >> 5) & Denorm_res) | \ + ((imm8 >> 6) & FinNeg_res) | \ + ((imm8 >> 7) & sNaN_res)); + return result; +} + /*** Manipulation functions ***/ #if !defined(simde_math_nextafter) @@ -706,6 +792,20 @@ simde_math_fpclassify(double v) { #endif #endif +#if !defined(simde_math_signbit) + #if SIMDE_MATH_BUILTIN_LIBM(signbit) + #if (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #define simde_math_signbit(x) __builtin_signbit(x) + #else + #define simde_math_signbit(x) __builtin_signbit(HEDLEY_STATIC_CAST(double, (x))) + #endif + #elif defined(SIMDE_MATH_HAVE_CMATH) + #define simde_math_signbit(x) std::signbit(x) + #elif defined(SIMDE_MATH_HAVE_MATH_H) + #define simde_math_signbit(x) signbit(x) + #endif +#endif + #if !defined(simde_math_cos) #if SIMDE_MATH_BUILTIN_LIBM(cos) #define simde_math_cos(v) __builtin_cos(v) @@ -852,16 +952,36 @@ simde_math_fpclassify(double v) { #endif #endif +#if !defined(simde_math_pow) + #if SIMDE_MATH_BUILTIN_LIBM(pow) + #define simde_math_pow(y, x) __builtin_pow(y, x) + #elif defined(SIMDE_MATH_HAVE_CMATH) + #define simde_math_pow(y, x) std::pow(y, x) + #elif defined(SIMDE_MATH_HAVE_MATH_H) + #define simde_math_pow(y, x) pow(y, x) + #endif +#endif + +#if !defined(simde_math_powf) + #if SIMDE_MATH_BUILTIN_LIBM(powf) + #define simde_math_powf(y, x) __builtin_powf(y, x) + #elif defined(SIMDE_MATH_HAVE_CMATH) + #define simde_math_powf(y, x) std::pow(y, x) + #elif defined(SIMDE_MATH_HAVE_MATH_H) + #define simde_math_powf(y, x) powf(y, x) + #endif +#endif + #if HEDLEY_HAS_BUILTIN(__builtin_exp10) || HEDLEY_GCC_VERSION_CHECK(3,4,0) # define simde_math_exp10(v) __builtin_exp10(v) #else -# define simde_math_exp10(v) pow(10.0, (v)) +# define simde_math_exp10(v) simde_math_pow(10.0, (v)) #endif #if HEDLEY_HAS_BUILTIN(__builtin_exp10f) || HEDLEY_GCC_VERSION_CHECK(3,4,0) # define simde_math_exp10f(v) __builtin_exp10f(v) #else -# define simde_math_exp10f(v) powf(10.0f, (v)) +# define simde_math_exp10f(v) simde_math_powf(10.0f, (v)) #endif #if !defined(simde_math_fabs) @@ -1104,26 +1224,6 @@ simde_math_fpclassify(double v) { #endif #endif -#if !defined(simde_math_pow) - #if SIMDE_MATH_BUILTIN_LIBM(pow) - #define simde_math_pow(y, x) __builtin_pow(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_pow(y, x) std::pow(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_pow(y, x) pow(y, x) - #endif -#endif - -#if !defined(simde_math_powf) - #if SIMDE_MATH_BUILTIN_LIBM(powf) - #define simde_math_powf(y, x) __builtin_powf(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_powf(y, x) std::pow(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_powf(y, x) powf(y, x) - #endif -#endif - #if !defined(simde_math_rint) #if SIMDE_MATH_BUILTIN_LIBM(rint) #define simde_math_rint(v) __builtin_rint(v) @@ -1166,7 +1266,7 @@ simde_math_fpclassify(double v) { #if !defined(simde_math_roundeven) #if \ - HEDLEY_HAS_BUILTIN(__builtin_roundeven) || \ + ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && HEDLEY_HAS_BUILTIN(__builtin_roundeven)) || \ HEDLEY_GCC_VERSION_CHECK(10,0,0) #define simde_math_roundeven(v) __builtin_roundeven(v) #elif defined(simde_math_round) && defined(simde_math_fabs) @@ -1186,7 +1286,7 @@ simde_math_fpclassify(double v) { #if !defined(simde_math_roundevenf) #if \ - HEDLEY_HAS_BUILTIN(__builtin_roundevenf) || \ 
+ ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && HEDLEY_HAS_BUILTIN(__builtin_roundevenf)) || \ HEDLEY_GCC_VERSION_CHECK(10,0,0) #define simde_math_roundevenf(v) __builtin_roundevenf(v) #elif defined(simde_math_roundf) && defined(simde_math_fabsf) @@ -1264,6 +1364,16 @@ simde_math_fpclassify(double v) { #endif #endif +#if !defined(simde_math_sqrtl) + #if SIMDE_MATH_BUILTIN_LIBM(sqrtl) + #define simde_math_sqrtl(v) __builtin_sqrtl(v) + #elif defined(SIMDE_MATH_HAVE_CMATH) + #define simde_math_sqrtl(v) std::sqrt(v) + #elif defined(SIMDE_MATH_HAVE_MATH_H) + #define simde_math_sqrtl(v) sqrtl(v) + #endif +#endif + #if !defined(simde_math_tan) #if SIMDE_MATH_BUILTIN_LIBM(tan) #define simde_math_tan(v) __builtin_tan(v) @@ -1399,15 +1509,12 @@ simde_math_fpclassify(double v) { #define simde_math_cdfnormf simde_math_cdfnormf #endif -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ - #if !defined(simde_math_cdfnorminv) && defined(simde_math_log) && defined(simde_math_sqrt) /*https://web.archive.org/web/20150910081113/http://home.online.no/~pjacklam/notes/invnorm/impl/sprouse/ltqnorm.c*/ static HEDLEY_INLINE double simde_math_cdfnorminv(double p) { - static const double a[] = { + static const double a[6] = { -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, @@ -1416,7 +1523,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 2.506628277459239e+00 }; - static const double b[] = { + static const double b[5] = { -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, @@ -1424,7 +1531,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ -1.328068155288572e+01 }; - static const double c[] = { + static const double c[6] = { -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, @@ -1433,7 +1540,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 2.938163982698783e+00 }; - static const double d[] = { + static const double d[4] = { 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, @@ -1474,7 +1581,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ static HEDLEY_INLINE float simde_math_cdfnorminvf(float p) { - static const float a[] = { + static const float a[6] = { -3.969683028665376e+01f, 2.209460984245205e+02f, -2.759285104469687e+02f, @@ -1482,14 +1589,14 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ -3.066479806614716e+01f, 2.506628277459239e+00f }; - static const float b[] = { + static const float b[5] = { -5.447609879822406e+01f, 1.615858368580409e+02f, -1.556989798598866e+02f, 6.680131188771972e+01f, -1.328068155288572e+01f }; - static const float c[] = { + static const float c[6] = { -7.784894002430293e-03f, -3.223964580411365e-01f, -2.400758277161838e+00f, @@ -1497,7 +1604,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 4.374664141464968e+00f, 2.938163982698783e+00f }; - static const float d[] = { + static const float d[4] = { 7.784695709041462e-03f, 3.224671290700398e-01f, 2.445134137142996e+00f, @@ -1584,7 +1691,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ if(x >= 0.0625 && x < 2.0) { return simde_math_erfinv(1.0 - x); } else if (x < 0.0625 && x >= 1.0e-100) { - double p[6] = { + static const double p[6] = { 0.1550470003116, 1.382719649631, 0.690969348887, @@ -1592,7 +1699,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 0.680544246825, -0.16444156791 }; - double q[3] = { + static const double q[3] = { 0.155024849822, 1.385228141995, 1.000000000000 @@ -1602,13 +1709,13 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) / (q[0] + t * (q[1] + t * (q[2]))); } else if 
(x < 1.0e-100 && x >= SIMDE_MATH_DBL_MIN) { - double p[4] = { + static const double p[4] = { 0.00980456202915, 0.363667889171, 0.97302949837, -0.5374947401 }; - double q[3] = { + static const double q[3] = { 0.00980451277802, 0.363699971544, 1.000000000000 @@ -1639,7 +1746,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 1.382719649631f, 0.690969348887f, -1.128081391617f, - 0.680544246825f + 0.680544246825f, -0.164441567910f }; static const float q[3] = { @@ -1675,8 +1782,6 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ #define simde_math_erfcinvf simde_math_erfcinvf #endif -HEDLEY_DIAGNOSTIC_POP - static HEDLEY_INLINE double simde_math_rad2deg(double radians) { diff --git a/wasm/relaxed-simd.h b/wasm/relaxed-simd.h index 3bfcc902a..b610eb08c 100644 --- a/wasm/relaxed-simd.h +++ b/wasm/relaxed-simd.h @@ -37,8 +37,10 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i8x16_swizzle_relaxed (simde_v128_t a, simde_v128_t b) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_i8x16_relaxed_swizzle(simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_i8x16_relaxed_swizzle(a, b); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_i8x16_swizzle(a, b); #else simde_v128_private @@ -71,15 +73,17 @@ simde_wasm_i8x16_swizzle_relaxed (simde_v128_t a, simde_v128_t b) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_i8x16_swizzle_relaxed(a, b) simde_wasm_i8x16_swizzle_relaxed((a), (b)) + #define wasm_i8x16_relaxed_swizzle(a, b) simde_wasm_i8x16_relaxed_swizzle((a), (b)) #endif /* Conversions */ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i32x4_trunc_f32x4 (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_i32x4_relaxed_trunc_f32x4 (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_i32x4_relaxed_trunc_f32x4(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_i32x4_trunc_sat_f32x4(a); #else simde_v128_private @@ -107,13 +111,15 @@ simde_wasm_i32x4_trunc_f32x4 (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_i32x4_trunc_f32x4(a) simde_wasm_i32x4_trunc_f32x4((a)) + #define wasm_i32x4_relaxed_trunc_f32x4(a) simde_wasm_i32x4_relaxed_trunc_f32x4((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_u32x4_trunc_f32x4 (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_u32x4_relaxed_trunc_f32x4 (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_u32x4_relaxed_trunc_f32x4(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_u32x4_trunc_sat_f32x4(a); #else simde_v128_private @@ -152,13 +158,15 @@ simde_wasm_u32x4_trunc_f32x4 (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_u32x4_trunc_f32x4(a) simde_wasm_u32x4_trunc_f32x4((a)) + #define wasm_u32x4_relaxed_trunc_f32x4(a) simde_wasm_u32x4_relaxed_trunc_f32x4((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i32x4_trunc_f64x2_zero (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_i32x4_relaxed_trunc_f64x2_zero (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_i32x4_relaxed_trunc_f64x2_zero(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_i32x4_trunc_sat_f64x2_zero(a); #else simde_v128_private @@ -209,13 +217,15 @@ simde_wasm_i32x4_trunc_f64x2_zero (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_i32x4_trunc_f64x2_zero(a) 
simde_wasm_i32x4_trunc_f64x2_zero((a)) + #define wasm_i32x4_relaxed_trunc_f64x2_zero(a) simde_wasm_i32x4_relaxed_trunc_f64x2_zero((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_u32x4_trunc_f64x2_zero (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_u32x4_relaxed_trunc_f64x2_zero (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_u32x4_relaxed_trunc_f64x2_zero(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_u32x4_trunc_sat_f64x2_zero(a); #else simde_v128_private @@ -254,14 +264,14 @@ simde_wasm_u32x4_trunc_f64x2_zero (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_u32x4_trunc_f64x2_zero(a) simde_wasm_u32x4_trunc_f64x2_zero((a)) + #define wasm_u32x4_relaxed_trunc_f64x2_zero(a) simde_wasm_u32x4_relaxed_trunc_f64x2_zero((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i8x16_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i8x16_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i8x16_blend(a, b, mask); + return wasm_i8x16_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -276,15 +286,15 @@ simde_wasm_i8x16_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i8x16_blend(a, b, c) simde_wasm_i8x16_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i8x16_relaxed_laneselect(a, b, mask) simde_wasm_i8x16_relaxed_laneselect((a), (b), (mask)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i16x8_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i16x8_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i16x8_blend(a, b, mask); + return wasm_i16x8_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -299,15 +309,15 @@ simde_wasm_i16x8_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i16x8_blend(a, b, c) simde_wasm_i16x8_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i16x8_relaxed_laneselect(a, b, mask) simde_wasm_i16x8_relaxed_laneselect((a), (b), (mask)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i32x4_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i32x4_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i32x4_blend(a, b, mask); + return wasm_i32x4_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -322,15 +332,15 @@ simde_wasm_i32x4_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i32x4_blend(a, b, c) simde_wasm_i32x4_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i32x4_relaxed_laneselect(a, b, c) simde_wasm_i32x4_relaxed_laneselect((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES 
simde_v128_t -simde_wasm_i64x2_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i64x2_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i64x2_blend(a, b, mask); + return wasm_i64x2_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -345,19 +355,19 @@ simde_wasm_i64x2_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i64x2_blend(a, b, c) simde_wasm_i64x2_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i64x2_relaxed_laneselect(a, b, mask) simde_wasm_i64x2_relaxed_laneselect((a), (b), (mask)) #endif /* fma */ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f32x4_relaxed_madd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f32x4_fma(a, b, c); + return wasm_f32x4_relaxed_madd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_add(a, wasm_f32x4_mul(b, c)); + return wasm_f32x4_add(wasm_f32x4_mul(a, b), c); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -366,19 +376,21 @@ simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_madd(c_.altivec_f32, b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) - r_.neon_f32 = vfmaq_f32(a_.neon_f32, c_.neon_f32, b_.neon_f32); + r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + r_.neon_f32 = vfmaq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlaq_f32(a_.neon_f32, b_.neon_f32, c_.neon_f32); + r_.neon_f32 = vmlaq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128 = _mm_fmadd_ps(c_.sse_m128, b_.sse_m128, a_.sse_m128); + r_.sse_m128 = _mm_fmadd_ps(a_.sse_m128, b_.sse_m128, c_.sse_m128); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v4f32 = __msa_fmadd_w(c_.msa_v4f32, a_.msa_v4f32, b_.msa_v4f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f32 = a_.f32 + (b_.f32 * c_.f32); + r_.f32 = (a_.f32 * b_.f32) + c_.f32; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fmaf(c_.f32[i], b_.f32[i], a_.f32[i]); + r_.f32[i] = simde_math_fmaf(a_.f32[i], b_.f32[i], c_.f32[i]); } #endif @@ -386,16 +398,16 @@ simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f32x4_fma(a, b) simde_wasm_f32x4_fma((a), (b)) + #define wasm_f32x4_relaxed_madd(a, b, c) simde_wasm_f32x4_relaxed_madd((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f64x2_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f64x2_relaxed_madd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f64x2_fma(a, b, c); + return wasm_f64x2_relaxed_madd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_add(a, wasm_f64x2_mul(b, c)); + return wasm_f64x2_add(wasm_f64x2_mul(a, b), c); #else 
simde_v128_private a_ = simde_v128_to_private(a), @@ -404,17 +416,19 @@ simde_wasm_f64x2_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_madd(c_.altivec_f64, b_.altivec_f64, a_.altivec_f64); + r_.altivec_f64 = vec_madd(a_.altivec_f64, b_.altivec_f64, c_.altivec_f64); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmaq_f64(a_.neon_f64, c_.neon_f64, b_.neon_f64); + r_.neon_f64 = vfmaq_f64(c_.neon_f64, a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128d = _mm_fmadd_pd(c_.sse_m128d, b_.sse_m128d, a_.sse_m128d); + r_.sse_m128d = _mm_fmadd_pd(a_.sse_m128d, b_.sse_m128d, c_.sse_m128d); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v2f64 = __msa_fmadd_d(c_.msa_v2f64, a_.msa_v2f64, b_.msa_v2f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f64 = a_.f64 + (b_.f64 * c_.f64); + r_.f64 = (a_.f64 * b_.f64) + c_.f64; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fma(c_.f64[i], b_.f64[i], a_.f64[i]); + r_.f64[i] = simde_math_fma(a_.f64[i], b_.f64[i], c_.f64[i]); } #endif @@ -422,18 +436,18 @@ simde_wasm_f64x2_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f64x2_fma(a, b) simde_wasm_f64x2_fma((a), (b)) + #define wasm_f64x2_relaxed_madd(a, b, c) simde_wasm_f64x2_relaxed_madd((a), (b), (c)) #endif /* fms */ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f32x4_relaxed_nmadd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f32x4_fms(a, b, c); + return wasm_f32x4_relaxed_nmadd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_sub(a, wasm_f32x4_mul(b, c)); + return wasm_f32x4_sub(c, wasm_f32x4_mul(a, b)); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -442,19 +456,21 @@ simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_nmsub(c_.altivec_f32, b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) - r_.neon_f32 = vfmsq_f32(a_.neon_f32, c_.neon_f32, b_.neon_f32); + r_.altivec_f32 = vec_nmsub(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlsq_f32(a_.neon_f32, b_.neon_f32, c_.neon_f32); + r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128 = _mm_fnmadd_ps(c_.sse_m128, b_.sse_m128, a_.sse_m128); + r_.sse_m128 = _mm_fnmadd_ps(a_.sse_m128, b_.sse_m128, c_.sse_m128); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v4f32 = __msa_fmsub_w(c_.msa_v4f32, a_.msa_v4f32, b_.msa_v4f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f32 = a_.f32 - (b_.f32 * c_.f32); + r_.f32 = c_.f32 - (a_.f32 * b_.f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] - (b_.f32[i] * c_.f32[i]); + r_.f32[i] = c_.f32[i] - (a_.f32[i] * b_.f32[i]); } #endif @@ -462,16 +478,16 @@ simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if 
defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f32x4_fms(a, b) simde_wasm_f32x4_fms((a), (b)) + #define wasm_f32x4_relaxed_nmadd(a, b, c) simde_wasm_f32x4_relaxed_nmadd((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f64x2_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f64x2_relaxed_nmadd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f64x2_fms(a, b, c); + return wasm_f64x2_relaxed_nmadd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_sub(a, wasm_f64x2_mul(b, c)); + return wasm_f64x2_sub(c, wasm_f64x2_mul(a, b)); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -480,17 +496,19 @@ simde_wasm_f64x2_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_nmsub(c_.altivec_f64, b_.altivec_f64, a_.altivec_f64); + r_.altivec_f64 = vec_nmsub(a_.altivec_f64, b_.altivec_f64, c_.altivec_f64); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmsq_f64(a_.neon_f64, c_.neon_f64, b_.neon_f64); + r_.neon_f64 = vfmsq_f64(c_.neon_f64, a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128d = _mm_fnmadd_pd(c_.sse_m128d, b_.sse_m128d, a_.sse_m128d); + r_.sse_m128d = _mm_fnmadd_pd(a_.sse_m128d, b_.sse_m128d, c_.sse_m128d); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v2f64 = __msa_fmsub_d(c_.msa_v2f64, a_.msa_v2f64, b_.msa_v2f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f64 = a_.f64 - (b_.f64 * c_.f64); + r_.f64 = c_.f64 - (a_.f64 * b_.f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] - (b_.f64[i] * c_.f64[i]); + r_.f64[i] = c_.f64[i] - (a_.f64[i] * b_.f64[i]); } #endif @@ -498,7 +516,89 @@ simde_wasm_f64x2_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f64x2_fms(a, b) simde_wasm_f64x2_fms((a), (b)) + #define wasm_f64x2_relaxed_nmadd(a, b, c) simde_wasm_f64x2_relaxed_nmadd((a), (b), (c)) +#endif + +/* min/max */ + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f32x4_relaxed_min (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f32x4_relaxed_min(a, b); + #elif defined(SIMDE_X86_SSE_NATIVE) + return simde_v128_from_m128(_mm_min_ps(simde_v128_to_m128(a), simde_v128_to_m128(b))); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return simde_v128_from_neon_f32(vminq_f32(simde_v128_to_neon_f32(a), simde_v128_to_neon_f32(b))); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + return simde_v128_from_altivec_f32(vec_min(simde_v128_to_altivec_f32(a), simde_v128_to_altivec_f32(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v4f32(__msa_fmin_w(simde_v128_to_msa_v4f32(a), simde_v128_to_msa_v4f32(b))); + #else + return simde_wasm_f32x4_min(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f32x4_relaxed_min(a, b) simde_wasm_f32x4_relaxed_min((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f32x4_relaxed_max (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f32x4_relaxed_max(a, b); + #elif defined(SIMDE_X86_SSE_NATIVE) + return simde_v128_from_m128(_mm_max_ps(simde_v128_to_m128(a), simde_v128_to_m128(b))); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return 
simde_v128_from_neon_f32(vmaxq_f32(simde_v128_to_neon_f32(a), simde_v128_to_neon_f32(b))); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + return simde_v128_from_altivec_f32(vec_max(simde_v128_to_altivec_f32(a), simde_v128_to_altivec_f32(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v4f32(__msa_fmax_w(simde_v128_to_msa_v4f32(a), simde_v128_to_msa_v4f32(b))); + #else + return simde_wasm_f32x4_max(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f32x4_relaxed_max(a, b) simde_wasm_f32x4_relaxed_max((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f64x2_relaxed_min (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f64x2_relaxed_min(a, b); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return simde_v128_from_m128d(_mm_min_pd(simde_v128_to_m128d(a), simde_v128_to_m128d(b))); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return simde_v128_from_neon_f64(vminq_f64(simde_v128_to_neon_f64(a), simde_v128_to_neon_f64(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v2f64(__msa_fmin_d(simde_v128_to_msa_v2f64(a), simde_v128_to_msa_v2f64(b))); + #else + return simde_wasm_f64x2_min(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f64x2_relaxed_min(a, b) simde_wasm_f64x2_relaxed_min((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f64x2_relaxed_max (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f64x2_relaxed_max(a, b); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return simde_v128_from_m128d(_mm_max_pd(simde_v128_to_m128d(a), simde_v128_to_m128d(b))); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return simde_v128_from_neon_f64(vmaxq_f64(simde_v128_to_neon_f64(a), simde_v128_to_neon_f64(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v2f64(__msa_fmax_d(simde_v128_to_msa_v2f64(a), simde_v128_to_msa_v2f64(b))); + #else + return simde_wasm_f32x4_max(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f32x4_relaxed_max(a, b) simde_wasm_f64x2_relaxed_max((a), (b)) #endif SIMDE_END_DECLS_ diff --git a/wasm/simd128.h b/wasm/simd128.h index 0433fc071..51bf7e8bf 100644 --- a/wasm/simd128.h +++ b/wasm/simd128.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson + * 2023 Michael R. 
Crusoe */ #if !defined(SIMDE_WASM_SIMD128_H) @@ -91,6 +92,17 @@ typedef union { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) SIMDE_ALIGN_TO_16 v128_t wasm_v128; + #elif defined(SIMDE_MIPS_MSA_NATIVE) + SIMDE_ALIGN_TO_16 v16i8 msa_v16i8; + SIMDE_ALIGN_TO_16 v8i16 msa_v8i16; + SIMDE_ALIGN_TO_16 v4i32 msa_v4i32; + SIMDE_ALIGN_TO_16 v2i64 msa_v2i64; + SIMDE_ALIGN_TO_16 v16u8 msa_v16u8; + SIMDE_ALIGN_TO_16 v8u16 msa_v8u16; + SIMDE_ALIGN_TO_16 v4u32 msa_v4u32; + SIMDE_ALIGN_TO_16 v2u64 msa_v2u64; + SIMDE_ALIGN_TO_16 v4f32 msa_v4f32; + SIMDE_ALIGN_TO_16 v2f64 msa_v2f64; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; @@ -110,13 +122,15 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) typedef v128_t simde_v128_t; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef int32x4_t simde_v128_t; + typedef int32x4_t simde_v128_t; #elif defined(SIMDE_X86_SSE2_NATIVE) - typedef __m128i simde_v128_t; + typedef __m128i simde_v128_t; #elif defined(SIMDE_X86_SSE_NATIVE) - typedef __m128 simde_v128_t; + typedef __m128 simde_v128_t; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(signed int) simde_v128_t; + typedef SIMDE_POWER_ALTIVEC_VECTOR(signed int) simde_v128_t; +#elif defined(SIMDE_MIPS_MSA_NATIVE) + typedef v4i32 simde_v128_t; #elif defined(SIMDE_VECTOR_SUBSCRIPT) typedef int32_t simde_v128_t SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #else @@ -151,8 +165,34 @@ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde_v128_private) == 16, "simde_v128_priva SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(simde_v128_private, simde_v128_t, simde_v128_to_private, simde_v128_from_private) -#if defined(SIMDE_X86_SSE2_NATIVE) +#define SIMDE_WASM_SIMD128_FMIN(x, y) \ + (simde_math_isnan(x) ? SIMDE_MATH_NAN \ + : simde_math_isnan(y) ? SIMDE_MATH_NAN \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (x) : (y)) \ + : ((x) < (y) ? (x) : (y))) + +#define SIMDE_WASM_SIMD128_FMAX(x, y) \ + (simde_math_isnan(x) ? SIMDE_MATH_NAN \ + : simde_math_isnan(y) ? SIMDE_MATH_NAN \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (y) : (x)) \ + : ((x) > (y) ? (x) : (y))) + +#define SIMDE_WASM_SIMD128_FMINF(x, y) \ + (simde_math_isnanf(x) ? SIMDE_MATH_NANF \ + : simde_math_isnanf(y) ? SIMDE_MATH_NANF \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (x) : (y)) \ + : ((x) < (y) ? (x) : (y))) + +#define SIMDE_WASM_SIMD128_FMAXF(x, y) \ + (simde_math_isnanf(x) ? SIMDE_MATH_NANF \ + : simde_math_isnanf(y) ? SIMDE_MATH_NANF \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (y) : (x)) \ + : ((x) > (y) ? 
(x) : (y))) + +#if defined(SIMDE_X86_SSE_NATIVE) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(__m128 , simde_v128_t, simde_v128_to_m128 , simde_v128_from_m128 ) +#endif +#if defined(SIMDE_X86_SSE2_NATIVE) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(__m128i, simde_v128_t, simde_v128_to_m128i, simde_v128_from_m128i) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(__m128d, simde_v128_t, simde_v128_to_m128d, simde_v128_from_m128d) #endif @@ -172,6 +212,19 @@ SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(simde_v128_private, simde_v128_ #endif #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ +#if defined(SIMDE_MIPS_MSA_NATIVE) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v16i8, simde_v128_t, simde_v128_to_msa_v16i8, simde_v128_from_msa_v16i8) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v8i16, simde_v128_t, simde_v128_to_msa_v8i16, simde_v128_from_msa_v8i16) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v4i32, simde_v128_t, simde_v128_to_msa_v4i32, simde_v128_from_msa_v4i32) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v2i64, simde_v128_t, simde_v128_to_msa_v2i64, simde_v128_from_msa_v2i64) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v16u8, simde_v128_t, simde_v128_to_msa_v16u8, simde_v128_from_msa_v16u8) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v8u16, simde_v128_t, simde_v128_to_msa_v8u16, simde_v128_from_msa_v8u16) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v4u32, simde_v128_t, simde_v128_to_msa_v4u32, simde_v128_from_msa_v4u32) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v2u64, simde_v128_t, simde_v128_to_msa_v2u64, simde_v128_from_msa_v2u64) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v4f32, simde_v128_t, simde_v128_to_msa_v4f32, simde_v128_from_msa_v4f32) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v2f64, simde_v128_t, simde_v128_to_msa_v2f64, simde_v128_from_msa_v2f64) +#endif + #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(SIMDE_POWER_ALTIVEC_VECTOR( signed char), simde_v128_t, simde_v128_to_altivec_i8 , simde_v128_from_altivec_i8 ) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(SIMDE_POWER_ALTIVEC_VECTOR( signed short), simde_v128_t, simde_v128_to_altivec_i16, simde_v128_from_altivec_i16) @@ -294,6 +347,55 @@ simde_wasm_i8x16_make ( (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u8x16_make ( + uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c4, uint8_t c5, uint8_t c6, uint8_t c7, + uint8_t c8, uint8_t c9, uint8_t c10, uint8_t c11, uint8_t c12, uint8_t c13, uint8_t c14, uint8_t c15) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return + wasm_u8x16_make( + c0, c1, c2, c3, c4, c5, c6, c7, + c8, c9, c10, c11, c12, c13, c14, c15); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi8( + HEDLEY_STATIC_CAST(char, c15), HEDLEY_STATIC_CAST(char, c14), HEDLEY_STATIC_CAST(char, c13), HEDLEY_STATIC_CAST(char, c12), + HEDLEY_STATIC_CAST(char, c11), HEDLEY_STATIC_CAST(char, c10), HEDLEY_STATIC_CAST(char, c9), HEDLEY_STATIC_CAST(char, c8), + HEDLEY_STATIC_CAST(char, c7), HEDLEY_STATIC_CAST(char, c6), HEDLEY_STATIC_CAST(char, c5), HEDLEY_STATIC_CAST(char, c4), + HEDLEY_STATIC_CAST(char, c3), HEDLEY_STATIC_CAST(char, c2), HEDLEY_STATIC_CAST(char, c1), HEDLEY_STATIC_CAST(char, c0)); + #else + simde_v128_private r_; + + r_.u8[ 0] = c0; + r_.u8[ 1] = c1; + r_.u8[ 2] = c2; + r_.u8[ 3] = c3; + r_.u8[ 4] = c4; + r_.u8[ 5] = c5; + r_.u8[ 6] = c6; + r_.u8[ 7] = 
c7; + r_.u8[ 8] = c8; + r_.u8[ 9] = c9; + r_.u8[10] = c10; + r_.u8[11] = c11; + r_.u8[12] = c12; + r_.u8[13] = c13; + r_.u8[14] = c14; + r_.u8[15] = c15; + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u8x16_make( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + simde_wasm_u8x16_make( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i16x8_make ( @@ -323,6 +425,37 @@ simde_wasm_i16x8_make ( simde_wasm_i16x8_make((c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u16x8_make ( + uint16_t c0, uint16_t c1, uint16_t c2, uint16_t c3, uint16_t c4, uint16_t c5, uint16_t c6, uint16_t c7) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u16x8_make(c0, c1, c2, c3, c4, c5, c6, c7); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi16( + HEDLEY_STATIC_CAST(short, c7), HEDLEY_STATIC_CAST(short, c6), HEDLEY_STATIC_CAST(short, c5), HEDLEY_STATIC_CAST(short, c4), + HEDLEY_STATIC_CAST(short, c3), HEDLEY_STATIC_CAST(short, c2), HEDLEY_STATIC_CAST(short, c1), HEDLEY_STATIC_CAST(short, c0)); + #else + simde_v128_private r_; + + r_.u16[0] = c0; + r_.u16[1] = c1; + r_.u16[2] = c2; + r_.u16[3] = c3; + r_.u16[4] = c4; + r_.u16[5] = c5; + r_.u16[6] = c6; + r_.u16[7] = c7; + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u16x8_make(c0, c1, c2, c3, c4, c5, c6, c7) \ + simde_wasm_u16x8_make((c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i32x4_make (int32_t c0, int32_t c1, int32_t c2, int32_t c3) { @@ -345,6 +478,30 @@ simde_wasm_i32x4_make (int32_t c0, int32_t c1, int32_t c2, int32_t c3) { #define wasm_i32x4_make(c0, c1, c2, c3) simde_wasm_i32x4_make((c0), (c1), (c2), (c3)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u32x4_make (uint32_t c0, uint32_t c1, uint32_t c2, uint32_t c3) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u32x4_make(c0, c1, c2, c3); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi32( + HEDLEY_STATIC_CAST(int, c3), HEDLEY_STATIC_CAST(int, c2), HEDLEY_STATIC_CAST(int, c1), HEDLEY_STATIC_CAST(int, c0)); + #else + simde_v128_private r_; + + r_.u32[0] = c0; + r_.u32[1] = c1; + r_.u32[2] = c2; + r_.u32[3] = c3; + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_make(c0, c1, c2, c3) simde_wasm_u32x4_make((c0), (c1), (c2), (c3)) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_make (int64_t c0, int64_t c1) { @@ -355,8 +512,8 @@ simde_wasm_i64x2_make (int64_t c0, int64_t c1) { #else simde_v128_private r_; - r_.i64[ 0] = c0; - r_.i64[ 1] = c1; + r_.i64[0] = c0; + r_.i64[1] = c1; return simde_v128_from_private(r_); #endif @@ -365,6 +522,27 @@ simde_wasm_i64x2_make (int64_t c0, int64_t c1) { #define wasm_i64x2_make(c0, c1) simde_wasm_i64x2_make((c0), (c1)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u64x2_make (uint64_t c0, uint64_t c1) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u64x2_make(c0, c1); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, c1), HEDLEY_STATIC_CAST(int64_t, c0)); + #else + simde_v128_private r_; + + r_.u64[0] = c0; + r_.u64[1] = c1; + + return 
simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_make(c0, c1) simde_wasm_u64x2_make((c0), (c1)) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_f32x4_make (simde_float32 c0, simde_float32 c1, simde_float32 c2, simde_float32 c3) { @@ -469,6 +647,62 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u8x16_const( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + wasm_u8x16_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u8x16_const( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + SIMDE_ASSERT_CONSTANT_(c2); \ + SIMDE_ASSERT_CONSTANT_(c3); \ + SIMDE_ASSERT_CONSTANT_(c4); \ + SIMDE_ASSERT_CONSTANT_(c5); \ + SIMDE_ASSERT_CONSTANT_(c6); \ + SIMDE_ASSERT_CONSTANT_(c7); \ + SIMDE_ASSERT_CONSTANT_(c8); \ + SIMDE_ASSERT_CONSTANT_(c9); \ + SIMDE_ASSERT_CONSTANT_(c10); \ + SIMDE_ASSERT_CONSTANT_(c11); \ + SIMDE_ASSERT_CONSTANT_(c12); \ + SIMDE_ASSERT_CONSTANT_(c13); \ + SIMDE_ASSERT_CONSTANT_(c13); \ + SIMDE_ASSERT_CONSTANT_(c15); \ + \ + simde_wasm_u8x16_make( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u8x16_const ( + uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c4, uint8_t c5, uint8_t c6, uint8_t c7, + uint8_t c8, uint8_t c9, uint8_t c10, uint8_t c11, uint8_t c12, uint8_t c13, uint8_t c14, uint8_t c15) { + return simde_wasm_u8x16_make( + c0, c1, c2, c3, c4, c5, c6, c7, + c8, c9, c10, c11, c12, c13, c14, c15); + } +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u8x16_const( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + simde_wasm_u8x16_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_i16x8_const( \ @@ -509,6 +743,46 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u16x8_const( \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + wasm_u16x8_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u16x8_const( \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + SIMDE_ASSERT_CONSTANT_(c2); \ + SIMDE_ASSERT_CONSTANT_(c3); \ + SIMDE_ASSERT_CONSTANT_(c4); \ + SIMDE_ASSERT_CONSTANT_(c5); \ + SIMDE_ASSERT_CONSTANT_(c6); \ + SIMDE_ASSERT_CONSTANT_(c7); \ + \ + simde_wasm_u16x8_make( \ + c0, c1, c2, c3, c4, c5, c6, c7); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u16x8_const ( + uint16_t c0, uint16_t c1, uint16_t c2, uint16_t c3, uint16_t c4, uint16_t c5, uint16_t c6, uint16_t c7) { + return simde_wasm_u16x8_make( + c0, c1, c2, c3, c4, c5, c6, c7); + } +#endif +#if 
defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u16x8_const( \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + simde_wasm_u16x8_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_i32x4_const( \ @@ -545,6 +819,42 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c0), (c1), (c2), (c3)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u32x4_const( \ + c0, c1, c2, c3) \ + wasm_u32x4_const( \ + (c0), (c1), (c2), (c3)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u32x4_const( \ + c0, c1, c2, c3) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + SIMDE_ASSERT_CONSTANT_(c2); \ + SIMDE_ASSERT_CONSTANT_(c3); \ + \ + simde_wasm_u32x4_make( \ + c0, c1, c2, c3); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u32x4_const ( + uint32_t c0, uint32_t c1, uint32_t c2, uint32_t c3) { + return simde_wasm_u32x4_make( + c0, c1, c2, c3); + } +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u32x4_const( \ + c0, c1, c2, c3) \ + simde_wasm_u32x4_const( \ + (c0), (c1), (c2), (c3)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_i64x2_const( \ @@ -579,6 +889,40 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c0), (c1)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u64x2_const( \ + c0, c1) \ + wasm_u64x2_const( \ + (c0), (c1)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u64x2_const( \ + c0, c1) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + \ + simde_wasm_u64x2_make( \ + c0, c1); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u64x2_const ( + uint64_t c0, uint64_t c1) { + return simde_wasm_u64x2_make( + c0, c1); + } +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u64x2_const( \ + c0, c1) \ + simde_wasm_u64x2_const( \ + (c0), (c1)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_f32x4_const( \ @@ -642,11 +986,7 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { } #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define \ - wasm_f64x2_const( \ - c0, c1) \ - simde_wasm_f64x2_const( \ - (c0), (c1)) + #define wasm_f64x2_const(c0, c1) simde_wasm_f64x2_const((c0), (c1)) #endif /* splat */ @@ -679,6 +1019,52 @@ simde_wasm_i8x16_splat (int8_t a) { #define wasm_i8x16_splat(a) simde_wasm_i8x16_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u8x16_splat (uint8_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u8x16_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vdupq_n_u8(a); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u8 = vec_splats(HEDLEY_STATIC_CAST(unsigned char, a)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u8x16_splat(a) simde_wasm_u8x16_splat((a)) +#endif + +#if 
defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i8x16_const_splat(a) wasm_i8x16_const_splat((a)) +#else + #define simde_wasm_i8x16_const_splat(a) simde_wasm_i8x16_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i8x16_const_splat(a) simde_wasm_i8x16_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u8x16_const_splat(a) wasm_u8x16_const_splat((a)) +#else + #define simde_wasm_u8x16_const_splat(a) simde_wasm_u8x16_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u8x16_const_splat(a) simde_wasm_u8x16_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i16x8_splat (int16_t a) { @@ -707,6 +1093,52 @@ simde_wasm_i16x8_splat (int16_t a) { #define wasm_i16x8_splat(a) simde_wasm_i16x8_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u16x8_splat (uint16_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u16x8_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vdupq_n_u16(a); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, a)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u16x8_splat(a) simde_wasm_u16x8_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i16x8_const_splat(a) wasm_i16x8_const_splat((a)) +#else + #define simde_wasm_i16x8_const_splat(a) simde_wasm_i16x8_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i16x8_const_splat(a) simde_wasm_i16x8_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u16x8_const_splat(a) wasm_u16x8_const_splat((a)) +#else + #define simde_wasm_u16x8_const_splat(a) simde_wasm_u16x8_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u16x8_const_splat(a) simde_wasm_u16x8_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i32x4_splat (int32_t a) { @@ -735,6 +1167,52 @@ simde_wasm_i32x4_splat (int32_t a) { #define wasm_i32x4_splat(a) simde_wasm_i32x4_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u32x4_splat (uint32_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u32x4_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = _mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vdupq_n_u32(a); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u32 = vec_splats(a); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_splat(a) simde_wasm_u32x4_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i32x4_const_splat(a) wasm_i32x4_const_splat((a)) +#else + #define simde_wasm_i32x4_const_splat(a) simde_wasm_i32x4_splat(a) +#endif +#if 
defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i32x4_const_splat(a) simde_wasm_i32x4_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u32x4_const_splat(a) wasm_u32x4_const_splat((a)) +#else + #define simde_wasm_u32x4_const_splat(a) simde_wasm_u32x4_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_const_splat(a) simde_wasm_u32x4_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_splat (int64_t a) { @@ -763,6 +1241,52 @@ simde_wasm_i64x2_splat (int64_t a) { #define wasm_i64x2_splat(a) simde_wasm_i64x2_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u64x2_splat (uint64_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u64x2_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) + r_.sse_m128i = _mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u64 = vdupq_n_u64(a); + #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u64 = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, a)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_splat(a) simde_wasm_u64x2_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i64x2_const_splat(a) wasm_i64x2_const_splat((a)) +#else + #define simde_wasm_i64x2_const_splat(a) simde_wasm_i64x2_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i64x2_const_splat(a) simde_wasm_i64x2_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u64x2_const_splat(a) wasm_u64x2_const_splat((a)) +#else + #define simde_wasm_u64x2_const_splat(a) simde_wasm_u64x2_splat(a) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i64x2_const_splat(a) simde_wasm_i64x2_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_f32x4_splat (simde_float32 a) { @@ -993,6 +1517,36 @@ simde_wasm_u16x8_extract_lane (simde_v128_t a, const int lane) { #define wasm_u16x8_extract_lane(a, lane) simde_wasm_u16x8_extract_lane((a), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_wasm_u32x4_extract_lane (simde_v128_t a, const int lane) { + simde_v128_private a_ = simde_v128_to_private(a); + return a_.u32[lane & 3]; +} +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u32x4_extract_lane(a, lane) HEDLEY_STATIC_CAST(uint32_t, wasm_u32x4_extract_lane((a), (lane))) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES) + #define simde_wasm_u32x4_extract_lane(a, lane) vgetq_lane_u32(simde_v128_to_neon_u32(a), (lane) & 3) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_extract_lane(a, lane) simde_wasm_u32x4_extract_lane((a), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_wasm_u64x2_extract_lane (simde_v128_t a, const int lane) { + simde_v128_private a_ = simde_v128_to_private(a); + return a_.u64[lane & 1]; +} +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u64x2_extract_lane(a, lane) HEDLEY_STATIC_CAST(uint64_t, wasm_u64x2_extract_lane((a), (lane))) +#elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES) + #define simde_wasm_u64x2_extract_lane(a, lane) vgetq_lane_u64(simde_v128_to_neon_u64(a), (lane) & 1) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_extract_lane(a, lane) simde_wasm_u64x2_extract_lane((a), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32 simde_wasm_f32x4_extract_lane (simde_v128_t a, const int lane) { @@ -3115,20 +3669,10 @@ simde_wasm_f32x4_abs (simde_v128_t a) { r_.neon_f32 = vabsq_f32(a_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_abs(a_.altivec_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < SIMDE_FLOAT32_C(0.0)); - r_.f32 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f32), - ( - (HEDLEY_REINTERPRET_CAST(__typeof__(m), -a_.f32) & m) | - (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32) & ~m) - ) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] < SIMDE_FLOAT32_C(0.0)) ? -a_.f32[i] : a_.f32[i]; + r_.f32[i] = simde_math_signbit(a_.f32[i]) ? -a_.f32[i] : a_.f32[i]; } #endif @@ -3155,20 +3699,10 @@ simde_wasm_f64x2_abs (simde_v128_t a) { r_.neon_f64 = vabsq_f64(a_.neon_f64); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_abs(a_.altivec_f64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64 < SIMDE_FLOAT64_C(0.0)); - r_.f64 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f64), - ( - (HEDLEY_REINTERPRET_CAST(__typeof__(m), -a_.f64) & m) | - (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64) & ~m) - ) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] < SIMDE_FLOAT64_C(0.0)) ? -a_.f64[i] : a_.f64[i]; + r_.f64[i] = simde_math_signbit(a_.f64[i]) ? 
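/* signbit() rather than `< 0.0` so that -0.0 (and, in practice, negative NaNs) also have the sign cleared, matching WASM's f64x2.abs, which simply clears the sign bit */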
-a_.f64[i] : a_.f64[i]; } #endif @@ -3579,9 +4113,9 @@ simde_wasm_i8x16_shl (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, count))); + r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, count & 7))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_sl(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count))); + r_.altivec_i8 = vec_sl(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i8 = a_.i8 << (count & 7); #else @@ -3611,9 +4145,9 @@ simde_wasm_i16x8_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sll_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count))); + r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count & 15))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_sl(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count))); + r_.altivec_i16 = vec_sl(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i16 = a_.i16 << (count & 15); #else @@ -3643,9 +4177,9 @@ simde_wasm_i32x4_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sll_epi32(a_.sse_m128i, _mm_cvtsi32_si128(count & 31)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count))); + r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count & 31))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_sl(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count))); + r_.altivec_i32 = vec_sl(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count & 31))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i32 = a_.i32 << (count & 31); #else @@ -3666,6 +4200,9 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_BUG_CLANG_60655) + count = count & 63; + #endif return wasm_i64x2_shl(a, count); #else simde_v128_private @@ -3675,9 +4212,9 @@ simde_wasm_i64x2_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sll_epi64(a_.sse_m128i, _mm_cvtsi32_si128(count & 63)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, count))); + r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, count & 63))); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_sl(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count))); + r_.altivec_i64 = vec_sl(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count & 63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i64 = a_.i64 << (count & 63); #else @@ -3707,9 +4244,9 @@ simde_wasm_i8x16_shr (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, -count))); + r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(-HEDLEY_STATIC_CAST(int8_t, count & 7))); #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_sra(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count))); + r_.altivec_i8 = vec_sra(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i8 = a_.i8 >> (count & 7); #else @@ -3739,9 +4276,9 @@ simde_wasm_i16x8_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sra_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -count))); + r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(-HEDLEY_STATIC_CAST(int16_t, count & 15))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count))); + r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i16 = a_.i16 >> (count & 15); #else @@ -3771,9 +4308,9 @@ simde_wasm_i32x4_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_sra_epi32(a_.sse_m128i, _mm_cvtsi32_si128(count & 31)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -count))); + r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-HEDLEY_STATIC_CAST(int32_t, count & 31))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_sra(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count))); + r_.altivec_i32 = vec_sra(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count & 31))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i32 = a_.i32 >> (count & 31); #else @@ -3794,6 +4331,9 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_BUG_CLANG_60655) + count = count & 63; + #endif return wasm_i64x2_shr(a, count); #else simde_v128_private @@ -3803,9 +4343,9 @@ simde_wasm_i64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_AVX512VL_NATIVE) return _mm_sra_epi64(a_.sse_m128i, _mm_cvtsi32_si128(count & 63)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -count))); + r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(-HEDLEY_STATIC_CAST(int64_t, count & 63))); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_sra(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count))); + r_.altivec_i64 = vec_sra(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count & 63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i64 = a_.i64 >> (count & 63); #else @@ -3833,9 +4373,9 @@ simde_wasm_u8x16_shr (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vshlq_u8(a_.neon_u8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, -count))); + r_.neon_u8 = vshlq_u8(a_.neon_u8, vdupq_n_s8(-HEDLEY_STATIC_CAST(int8_t, count & 7))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_sr(a_.altivec_u8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count))); + r_.altivec_u8 = vec_sr(a_.altivec_u8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u8 = a_.u8 >> 
(count & 7); #else @@ -3865,9 +4405,9 @@ simde_wasm_u16x8_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_srl_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -count))); + r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(-HEDLEY_STATIC_CAST(int16_t, count & 15))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count))); + r_.altivec_u16 = vec_sr(a_.altivec_u16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u16 = a_.u16 >> (count & 15); #else @@ -3897,9 +4437,9 @@ simde_wasm_u32x4_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_srl_epi32(a_.sse_m128i, _mm_cvtsi32_si128(count & 31)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -count))); + r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(-HEDLEY_STATIC_CAST(int32_t, count & 31))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_sra(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count))); + r_.altivec_u32 = vec_sr(a_.altivec_u32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count & 31))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u32 = a_.u32 >> (count & 31); #else @@ -3920,6 +4460,9 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_u64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_BUG_CLANG_60655) + count = count & 63; + #endif return wasm_u64x2_shr(a, count); #else simde_v128_private @@ -3929,9 +4472,9 @@ simde_wasm_u64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_srl_epi64(a_.sse_m128i, _mm_cvtsi32_si128(count & 63)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -count))); + r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-HEDLEY_STATIC_CAST(int64_t, count & 63))); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_sra(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count))); + r_.altivec_u64 = vec_sr(a_.altivec_u64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count & 63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u64 = a_.u64 >> (count & 63); #else @@ -4317,12 +4860,6 @@ simde_wasm_i16x8_mul (simde_v128_t a, simde_v128_t b) { r_.sse_m128i = _mm_mullo_epi16(a_.sse_m128i, b_.sse_m128i); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = - vec_pack( - vec_mule(a_.altivec_i16, b_.altivec_i16), - vec_mulo(a_.altivec_i16, b_.altivec_i16) - ); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.i16 = a_.i16 * b_.i16; #else @@ -4471,26 +5008,6 @@ simde_wasm_i16x8_q15mulr_sat (simde_v128_t a, simde_v128_t b) { /* https://github.com/WebAssembly/simd/pull/365 */ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vqrdmulhq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i y = _mm_mulhrs_epi16(a_.sse_m128i, b_.sse_m128i); - __m128i tmp = _mm_cmpeq_epi16(y, _mm_set1_epi16(INT16_MAX)); - r_.sse_m128i = _mm_xor_si128(y, tmp); - #elif defined(SIMDE_X86_SSE2_NATIVE) - 
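/* per the WASM spec the shift count is taken modulo the lane width in bits, hence the masking applied throughout these shift helpers */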
const __m128i prod_lo = _mm_mullo_epi16(a_.sse_m128i, b_.sse_m128i); - const __m128i prod_hi = _mm_mulhi_epi16(a_.sse_m128i, b_.sse_m128i); - const __m128i tmp = - _mm_add_epi16( - _mm_avg_epu16( - _mm_srli_epi16(prod_lo, 14), - _mm_setzero_si128() - ), - _mm_add_epi16(prod_hi, prod_hi) - ); - r_.sse_m128i = - _mm_xor_si128( - tmp, - _mm_cmpeq_epi16(_mm_set1_epi16(INT16_MAX), tmp) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4747,43 +5264,22 @@ simde_wasm_f32x4_min (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128 = _mm_blendv_ps( - _mm_set1_ps(SIMDE_MATH_NANF), - _mm_min_ps(a_.sse_m128, b_.sse_m128), - _mm_cmpord_ps(a_.sse_m128, b_.sse_m128)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128 m = _mm_cmpord_ps(a_.sse_m128, b_.sse_m128); - r_.sse_m128 = - _mm_or_ps( - _mm_and_ps(m, _mm_min_ps(a_.sse_m128, b_.sse_m128)), - _mm_andnot_ps(m, _mm_set1_ps(SIMDE_MATH_NANF)) - ); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) condition; - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) a_lt_b = - vec_cmpgt(b_.altivec_f32, a_.altivec_f32); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - condition = vec_orc(a_lt_b, vec_cmpeq(a_.altivec_f32, a_.altivec_f32)); - #else - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) a_not_nan = - vec_cmpeq(a_.altivec_f32, a_.altivec_f32); - condition = vec_or(a_lt_b, vec_nor(a_not_nan, a_not_nan)); - #endif - - r_.altivec_f32 = - vec_sel( - b_.altivec_f32, - a_.altivec_f32, - condition - ); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L202 + simde_v128_private scratch; + scratch.sse_m128 = a_.sse_m128; + scratch.sse_m128 = _mm_min_ps(scratch.sse_m128, b_.sse_m128); + r_.sse_m128 = b_.sse_m128; + r_.sse_m128 = _mm_min_ps(r_.sse_m128, a_.sse_m128); + scratch.sse_m128 = _mm_or_ps(scratch.sse_m128, r_.sse_m128); + r_.sse_m128 = _mm_cmpunord_ps(r_.sse_m128, scratch.sse_m128); + scratch.sse_m128 = _mm_or_ps(scratch.sse_m128, r_.sse_m128); + r_.sse_m128i = _mm_srli_epi32(r_.sse_m128i, 10); + r_.sse_m128 = _mm_andnot_ps(r_.sse_m128, scratch.sse_m128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_math_isnan(a_.f32[i]) || (a_.f32[i] < b_.f32[i])) ? 
a_.f32[i] : b_.f32[i]; + r_.f32[i] = SIMDE_WASM_SIMD128_FMINF(a_.f32[i], b_.f32[i]); } #endif @@ -4805,34 +5301,22 @@ simde_wasm_f64x2_min (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128d = _mm_blendv_pd( - _mm_set1_pd(SIMDE_MATH_NAN), - _mm_min_pd(a_.sse_m128d, b_.sse_m128d), - _mm_cmpord_pd(a_.sse_m128d, b_.sse_m128d)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128d m = _mm_cmpord_pd(a_.sse_m128d, b_.sse_m128d); - r_.sse_m128d = - _mm_or_pd( - _mm_and_pd(m, _mm_min_pd(a_.sse_m128d, b_.sse_m128d)), - _mm_andnot_pd(m, _mm_set1_pd(SIMDE_MATH_NAN)) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f64 = - vec_sel( - b_.altivec_f64, - a_.altivec_f64, - vec_orc( - vec_cmpgt(b_.altivec_f64, a_.altivec_f64), - vec_cmpeq(a_.altivec_f64, a_.altivec_f64) - ) - ); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L263 + simde_v128_private scratch; + scratch.sse_m128d = a_.sse_m128d; + scratch.sse_m128d = _mm_min_pd(scratch.sse_m128d, b_.sse_m128d); + r_.sse_m128d = b_.sse_m128d; + r_.sse_m128d = _mm_min_pd(r_.sse_m128d, a_.sse_m128d); + scratch.sse_m128d = _mm_or_pd(scratch.sse_m128d, r_.sse_m128d); + r_.sse_m128d = _mm_cmpunord_pd(r_.sse_m128d, scratch.sse_m128d); + scratch.sse_m128d = _mm_or_pd(scratch.sse_m128d, r_.sse_m128d); + r_.sse_m128i = _mm_srli_epi64(r_.sse_m128i, 13); + r_.sse_m128d = _mm_andnot_pd(r_.sse_m128d, scratch.sse_m128d); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_math_isnan(a_.f64[i]) || (a_.f64[i] < b_.f64[i])) ? 
a_.f64[i] : b_.f64[i]; + r_.f64[i] = SIMDE_WASM_SIMD128_FMIN(a_.f64[i], b_.f64[i]); } #endif @@ -5077,59 +5561,23 @@ simde_wasm_f32x4_max (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128 = _mm_blendv_ps( - _mm_set1_ps(SIMDE_MATH_NANF), - _mm_max_ps(a_.sse_m128, b_.sse_m128), - _mm_cmpord_ps(a_.sse_m128, b_.sse_m128)); - #elif defined(SIMDE_X86_SSE_NATIVE) - __m128 m = _mm_or_ps(_mm_cmpneq_ps(a_.sse_m128, a_.sse_m128), _mm_cmpgt_ps(a_.sse_m128, b_.sse_m128)); - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.ssse_m128 = _mm_blendv_ps(b_.sse_m128, a_.sse_m128, m); - #else - r_.sse_m128 = - _mm_or_ps( - _mm_and_ps(m, a_.sse_m128), - _mm_andnot_ps(m, b_.sse_m128) - ); - #endif - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f32 = - vec_sel( - b_.altivec_f32, - a_.altivec_f32, - vec_orc( - vec_cmpgt(a_.altivec_f32, b_.altivec_f32), - vec_cmpeq(a_.altivec_f32, a_.altivec_f32) - ) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) cmpres = vec_cmpeq(a_.altivec_f32, a_.altivec_f32); - r_.altivec_f32 = - vec_sel( - b_.altivec_f32, - a_.altivec_f32, - vec_or( - vec_cmpgt(a_.altivec_f32, b_.altivec_f32), - vec_nor(cmpres, cmpres) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - int32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), (a_.f32 != a_.f32) | (a_.f32 > b_.f32)); - r_.f32 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f32), - ( - ( m & HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32)) | - (~m & HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f32)) - ) - ); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L231 + simde_v128_private scratch; + scratch.sse_m128 = a_.sse_m128; + scratch.sse_m128 = _mm_max_ps(scratch.sse_m128, b_.sse_m128); + r_.sse_m128 = b_.sse_m128; + r_.sse_m128 = _mm_max_ps(r_.sse_m128, a_.sse_m128); + r_.sse_m128 = _mm_xor_ps(r_.sse_m128, scratch.sse_m128); + scratch.sse_m128 = _mm_or_ps(scratch.sse_m128, r_.sse_m128); + scratch.sse_m128 = _mm_sub_ps(scratch.sse_m128, r_.sse_m128); + r_.sse_m128 = _mm_cmpunord_ps(r_.sse_m128, scratch.sse_m128); + r_.sse_m128i = _mm_srli_epi32(r_.sse_m128i, 10); + r_.sse_m128 = _mm_andnot_ps(r_.sse_m128, scratch.sse_m128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_math_isnan(a_.f32[i]) || (a_.f32[i] > b_.f32[i])) ? 
a_.f32[i] : b_.f32[i]; + r_.f32[i] = SIMDE_WASM_SIMD128_FMAXF(a_.f32[i], b_.f32[i]); } #endif @@ -5151,59 +5599,23 @@ simde_wasm_f64x2_max (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128d = _mm_blendv_pd( - _mm_set1_pd(SIMDE_MATH_NAN), - _mm_max_pd(a_.sse_m128d, b_.sse_m128d), - _mm_cmpord_pd(a_.sse_m128d, b_.sse_m128d)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128d m = _mm_or_pd(_mm_cmpneq_pd(a_.sse_m128d, a_.sse_m128d), _mm_cmpgt_pd(a_.sse_m128d, b_.sse_m128d)); - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.ssse_m128d = _mm_blendv_pd(b_.sse_m128d, a_.sse_m128d, m); - #else - r_.sse_m128d = - _mm_or_pd( - _mm_and_pd(m, a_.sse_m128d), - _mm_andnot_pd(m, b_.sse_m128d) - ); - #endif - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f64 = - vec_sel( - b_.altivec_f64, - a_.altivec_f64, - vec_orc( - vec_cmpgt(a_.altivec_f64, b_.altivec_f64), - vec_cmpeq(a_.altivec_f64, a_.altivec_f64) - ) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL long long) cmpres = vec_cmpeq(a_.altivec_f64, a_.altivec_f64); - r_.altivec_f64 = - vec_sel( - b_.altivec_f64, - a_.altivec_f64, - vec_or( - vec_cmpgt(a_.altivec_f64, b_.altivec_f64), - vec_nor(cmpres, cmpres) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - int64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), (a_.f64 != a_.f64) | (a_.f64 > b_.f64)); - r_.f64 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f64), - ( - ( m & HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64)) | - (~m & HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f64)) - ) - ); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L301 + simde_v128_private scratch; + scratch.sse_m128d = a_.sse_m128d; + scratch.sse_m128d = _mm_max_pd(scratch.sse_m128d, b_.sse_m128d); + r_.sse_m128d = b_.sse_m128d; + r_.sse_m128d = _mm_max_pd(r_.sse_m128d, a_.sse_m128d); + r_.sse_m128d = _mm_xor_pd(r_.sse_m128d, scratch.sse_m128d); + scratch.sse_m128d = _mm_or_pd(scratch.sse_m128d, r_.sse_m128d); + scratch.sse_m128d = _mm_sub_pd(scratch.sse_m128d, r_.sse_m128d); + r_.sse_m128d = _mm_cmpunord_pd(r_.sse_m128d, scratch.sse_m128d); + r_.sse_m128i = _mm_srli_epi64(r_.sse_m128i, 13); + r_.sse_m128d = _mm_andnot_pd(r_.sse_m128d, scratch.sse_m128d); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_math_isnan(a_.f64[i]) || (a_.f64[i] > b_.f64[i])) ? 
a_.f64[i] : b_.f64[i]; + r_.f64[i] = SIMDE_WASM_SIMD128_FMAX(a_.f64[i], b_.f64[i]); } #endif @@ -5373,6 +5785,10 @@ simde_wasm_u8x16_avgr (simde_v128_t a, simde_v128_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.sse_m128i = _mm_avg_epu8(a_.sse_m128i, b_.sse_m128i); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vrhaddq_u8(a_.neon_u8, b_.neon_u8); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -5400,6 +5816,10 @@ simde_wasm_u16x8_avgr (simde_v128_t a, simde_v128_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.sse_m128i = _mm_avg_epu16(a_.sse_m128i, b_.sse_m128i); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vrhaddq_u16(a_.neon_u16, b_.neon_u16); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -5619,7 +6039,7 @@ simde_wasm_f64x2_pmin (simde_v128_t a, simde_v128_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.sse_m128d = _mm_min_pd(b_.sse_m128d, a_.sse_m128d); #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vminq_f64(a_.neon_f64, b_.neon_f64); + r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) @@ -5630,11 +6050,11 @@ simde_wasm_f64x2_pmin (simde_v128_t a, simde_v128_t b) { a_.neon_f64 ); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = + r_.altivec_f64 = vec_sel( - a_.altivec_f32, - b_.altivec_f32, - vec_cmpgt(a_.altivec_f32, b_.altivec_f32) + a_.altivec_f64, + b_.altivec_f64, + vec_cmpgt(a_.altivec_f64, b_.altivec_f64) ); #else SIMDE_VECTORIZE @@ -5663,7 +6083,7 @@ simde_wasm_f32x4_pmax (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) r_.sse_m128 = _mm_max_ps(b_.sse_m128, a_.sse_m128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vbslq_f32(vcltq_f32(a_.neon_f32, b_.neon_f32), b_.neon_f32, a_.neon_f32); @@ -7934,7 +8354,9 @@ simde_wasm_v128_load16_lane (const void * a, simde_v128_t vec, const int lane) simde_v128_private a_ = simde_v128_to_private(vec); - a_.i16[lane] = *HEDLEY_REINTERPRET_CAST(const int16_t *, a); + int16_t tmp = 0; + simde_memcpy(&tmp, a, sizeof(int16_t)); + a_.i16[lane] = tmp; return simde_v128_from_private(a_); } @@ -7952,7 +8374,9 @@ simde_wasm_v128_load32_lane (const void * a, simde_v128_t vec, const int lane) simde_v128_private a_ = simde_v128_to_private(vec); - a_.i32[lane] = *HEDLEY_REINTERPRET_CAST(const int32_t *, a); + int32_t tmp = 0; + simde_memcpy(&tmp, a, sizeof(int32_t)); + a_.i32[lane] = tmp; return simde_v128_from_private(a_); } @@ -7970,7 +8394,9 @@ simde_wasm_v128_load64_lane (const void * a, simde_v128_t vec, const int lane) simde_v128_private a_ = simde_v128_to_private(vec); - a_.i64[lane] = *HEDLEY_REINTERPRET_CAST(const int64_t *, a); + int64_t tmp = 0; + simde_memcpy(&tmp, a, sizeof(int64_t)); + a_.i64[lane] = tmp; return simde_v128_from_private(a_); } @@ -8183,7 +8609,7 @@ simde_wasm_i32x4_trunc_sat_f32x4 (simde_v128_t a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32); #elif 
defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.f32); + SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); #elif defined(SIMDE_X86_SSE2_NATIVE) const __m128i i32_max_mask = _mm_castps_si128(_mm_cmpgt_ps(a_.sse_m128, _mm_set1_ps(SIMDE_FLOAT32_C(2147483520.0)))); const __m128 clamped = _mm_max_ps(a_.sse_m128, _mm_set1_ps(HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))); @@ -8205,7 +8631,7 @@ simde_wasm_i32x4_trunc_sat_f32x4 (simde_v128_t a) { ); #endif r_.sse_m128i = _mm_and_si128(r_.sse_m128i, _mm_castps_si128(_mm_cmpord_ps(a_.sse_m128, a_.sse_m128))); - #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); const __typeof__(a_.f32) max_representable = { SIMDE_FLOAT32_C(2147483520.0), SIMDE_FLOAT32_C(2147483520.0), SIMDE_FLOAT32_C(2147483520.0), SIMDE_FLOAT32_C(2147483520.0) }; @@ -8580,7 +9006,7 @@ simde_wasm_f32x4_ceil (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_ceilf(a_.f32[i])); } #endif @@ -8603,30 +9029,6 @@ simde_wasm_f64x2_ceil (simde_v128_t a) { #if defined(SIMDE_X86_SSE4_1_NATIVE) r_.sse_m128d = _mm_round_pd(a_.sse_m128d, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://github.com/WebAssembly/simd/pull/232 */ - - const __m128d all_but_sign_set = _mm_castsi128_pd(_mm_set1_epi64x(INT64_C(0x7FFFFFFFFFFFFFFF))); - /* https://stackoverflow.com/a/55077612 explains this a bit */ - const __m128d bignum = _mm_set1_pd(4.50359962737049600000e+15); - const __m128d sign_cleared = _mm_and_pd(a_.sse_m128d, all_but_sign_set); - - __m128d mask = - _mm_and_pd( - _mm_cmpnle_pd(bignum, sign_cleared), - all_but_sign_set - ); - const __m128d tmp = - _mm_or_pd( - _mm_andnot_pd(mask, a_.sse_m128d), - _mm_and_pd (mask, _mm_sub_pd(_mm_add_pd(sign_cleared, bignum), bignum)) - ); - - r_.sse_m128d = - _mm_add_pd( - tmp, - _mm_and_pd(_mm_and_pd(_mm_cmplt_pd(tmp, a_.sse_m128d), all_but_sign_set), _mm_set1_pd(1.0)) - ); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndpq_f64(a_.neon_f64); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) @@ -8634,7 +9036,7 @@ simde_wasm_f64x2_ceil (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_ceil(a_.f64[i])); } #endif @@ -8689,7 +9091,7 @@ simde_wasm_f32x4_floor (simde_v128_t a) { ) ); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndmq_f32(a_.f32); + r_.neon_f32 = vrndmq_f32(a_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) const int32x4_t input_as_int = vcvtq_s32_f32(a_.f32); const float32x4_t input_truncated = vcvtq_f32_s32(input_as_int); @@ -8722,7 +9124,7 @@ simde_wasm_f32x4_floor (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_floorf(a_.f32[i])); } #endif @@ -8745,7 +9147,7 @@ simde_wasm_f64x2_floor (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_floor(a_.f64[i])); } return simde_v128_from_private(r_); @@ -8769,7 
+9171,7 @@ simde_wasm_f32x4_trunc (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_truncf(a_.f32[i])); } return simde_v128_from_private(r_); @@ -8791,7 +9193,7 @@ simde_wasm_f64x2_trunc (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_trunc(a_.f64[i])); } return simde_v128_from_private(r_); @@ -8815,7 +9217,7 @@ simde_wasm_f32x4_nearest (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_nearbyintf(a_.f32[i])); } return simde_v128_from_private(r_); @@ -8837,7 +9239,7 @@ simde_wasm_f64x2_nearest (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_round(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_nearbyint(a_.f64[i])); } return simde_v128_from_private(r_); @@ -8868,7 +9270,7 @@ simde_wasm_f32x4_sqrt (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_sqrtf(a_.f32[i])); } #endif @@ -8889,7 +9291,7 @@ simde_wasm_f64x2_sqrt (simde_v128_t a) { a_ = simde_v128_to_private(a), r_; - #if defined(SIMDE_X86_SSE_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) r_.sse_m128d = _mm_sqrt_pd(a_.sse_m128d); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsqrtq_f64(a_.neon_f64); @@ -8898,7 +9300,7 @@ simde_wasm_f64x2_sqrt (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_sqrt(a_.f64[i])); } #endif diff --git a/x86/aes.h b/x86/aes.h new file mode 100644 index 000000000..1d5b04926 --- /dev/null +++ b/x86/aes.h @@ -0,0 +1,417 @@ +/* MIT License + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#if !defined(SIMDE_X86_AES_H) +#define SIMDE_X86_AES_H + +/* + * Advanced Encryption Standard + * @author Dani Huertas + * @email huertas.dani@gmail.com + * + * Based on the document FIPS PUB 197 + */ + +#include "sse2.h" + +/* + * Multiplication in GF(2^8) + * http://en.wikipedia.org/wiki/Finite_field_arithmetic + * Irreducible polynomial m(x) = x8 + x4 + x3 + x + 1 + * + * NOTE: This function can be easily replaced with a look up table for a speed + * boost, at the expense of an increase in memory size. + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t gmult(uint8_t a, uint8_t b) { + uint8_t p = 0, i = 0, hbs = 0; + + for (i = 0; i < 8; i++) { + if (b & 1) { + p ^= a; + } + + hbs = a & 0x80; + a <<= 1; + if (hbs) a ^= 0x1b; // 0000 0001 0001 1011 + b >>= 1; + } + + return (uint8_t)p; +} + */ + +#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) + +#include "../simde-aes.h" + +/* + * Transformation in the Cipher and Inverse Cipher in which a Round + * Key is added to the State using an XOR operation. The length of a + * Round Key equals the size of the State (i.e., for Nb = 4, the Round + * Key length equals 128 bits/16 bytes). + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_add_round_key(uint8_t *state, simde__m128i_private w, uint8_t r) { + + int Nb = simde_x_aes_Nb; + uint8_t c; + + for (c = 0; c < Nb; c++) { + state[Nb*0+c] = state[Nb*0+c]^w.u8[4*Nb*r+4*c+0]; + state[Nb*1+c] = state[Nb*1+c]^w.u8[4*Nb*r+4*c+1]; + state[Nb*2+c] = state[Nb*2+c]^w.u8[4*Nb*r+4*c+2]; + state[Nb*3+c] = state[Nb*3+c]^w.u8[4*Nb*r+4*c+3]; + } +} + +/* + * Transformation in the Cipher that takes all of the columns of the + * State and mixes their data (independently of one another) to + * produce new columns. + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_mix_columns(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x02, 0x01, 0x01, 0x03}; // a(x) = {02} + {01}x + {01}x2 + {03}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = state[Nb*i+j]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(0, col, res); + + for (i = 0; i < 4; i++) { + state[Nb*i+j] = res[i]; + } + } +} + +/* + * Transformation in the Inverse Cipher that is the inverse of + * MixColumns(). + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_inv_mix_columns(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = state[Nb*i+j]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(4, col, res); + + for (i = 0; i < 4; i++) { + state[Nb*i+j] = res[i]; + } + } +} + +/* + * Transformation in the Cipher that processes the State by cyclically + * shifting the last three rows of the State by different offsets. + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_shift_rows(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + uint8_t i, k, s, tmp; + + for (i = 1; i < 4; i++) { + // shift(1,4)=1; shift(2,4)=2; shift(3,4)=3 + // shift(r, 4) = r; + s = 0; + while (s < i) { + tmp = state[Nb*i+0]; + + for (k = 1; k < Nb; k++) { + state[Nb*i+k-1] = state[Nb*i+k]; + } + + state[Nb*i+Nb-1] = tmp; + s++; + } + } +} + +/* + * Transformation in the Inverse Cipher that is the inverse of + * ShiftRows(). 
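+ * (Each row i of the 4x4 state is rotated right by i byte positions, undoing the left rotation that ShiftRows() applies.)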
+ */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_inv_shift_rows(uint8_t *state) { + + uint8_t Nb = simde_x_aes_Nb; + uint8_t i, k, s, tmp; + + for (i = 1; i < 4; i++) { + s = 0; + while (s < i) { + tmp = state[Nb*i+Nb-1]; + + for (k = Nb-1; k > 0; k--) { + state[Nb*i+k] = state[Nb*i+k-1]; + } + + state[Nb*i+0] = tmp; + s++; + } + } +} + +/* + * Transformation in the Cipher that processes the State using a non + * linear byte substitution table (S-box) that operates on each of the + * State bytes independently. + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_sub_bytes(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + uint8_t i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + // s_box row: yyyy ---- + // s_box col: ---- xxxx + // s_box[16*(yyyy) + xxxx] == s_box[yyyyxxxx] + state[Nb*i+j] = simde_x_aes_s_box[state[Nb*i+j]]; + } + } +} + +/* + * Transformation in the Inverse Cipher that is the inverse of + * SubBytes(). + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_inv_sub_bytes(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + uint8_t i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + state[Nb*i+j] = simde_x_aes_inv_s_box[state[Nb*i+j]]; + } + } +} + +/* + * Performs the AES cipher operation + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_enc(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { + + int Nb = simde_x_aes_Nb; + uint8_t state[4*simde_x_aes_Nb]; + uint8_t r = 0, i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + state[Nb*i+j] = in.u8[i+4*j]; + } + } + + simde_x_aes_sub_bytes(state); + simde_x_aes_shift_rows(state); + + if (!is_last) + simde_x_aes_mix_columns(state); + + simde_x_aes_add_round_key(state, w, r); + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + out->u8[i+4*j] = state[Nb*i+j]; + } + } +} + +/* + * Performs the AES inverse cipher operation + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_dec(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { + + int Nb = simde_x_aes_Nb; + uint8_t state[4*simde_x_aes_Nb]; + uint8_t r = 0, i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + state[Nb*i+j] = in.u8[i+4*j]; + } + } + + simde_x_aes_inv_shift_rows(state); + simde_x_aes_inv_sub_bytes(state); + + if (!is_last) + simde_x_aes_inv_mix_columns(state); + + simde_x_aes_add_round_key(state, w, r); + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + out->u8[i+4*j] = state[Nb*i+j]; + } + } +} +#endif // if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesenc_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesenc_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = veorq_u8( + vaesmcq_u8(vaeseq_u8(a_.neon_u8, vdupq_n_u8(0))), + round_key_.neon_u8); + #else + simde_x_aes_enc(a_, &result_, round_key_, 0); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesenc_si128(a, b) simde_mm_aesenc_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesdec_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesdec_si128(a, round_key); + 
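/* NEON path below: AESD folds AddRoundKey (with a zero key), InvShiftRows and InvSubBytes into one instruction, so vaesimcq_u8 supplies InvMixColumns and the real round key is XORed last, matching x86 AESDEC; without the crypto extension the portable simde_x_aes_dec() fallback is used. */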
#else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = veorq_u8( + vaesimcq_u8(vaesdq_u8(a_.neon_u8, vdupq_n_u8(0))), + round_key_.neon_u8); + #else + simde_x_aes_dec(a_, &result_, round_key_, 0); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesdec_si128(a, b) simde_mm_aesdec_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesenclast_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesenclast_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = vaeseq_u8(a_.neon_u8, vdupq_n_u8(0)); + result_.neon_i32 = veorq_s32(result_.neon_i32, round_key_.neon_i32); // _mm_xor_si128 + #else + simde_x_aes_enc(a_, &result_, round_key_, 1); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesenclast_si128(a, b) simde_mm_aesenclast_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesdeclast_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesdeclast_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = veorq_u8( + vaesdq_u8(a_.neon_u8, vdupq_n_u8(0)), + round_key_.neon_u8); + #else + simde_x_aes_dec(a_, &result_, round_key_, 1); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesdeclast_si128(a, b) simde_mm_aesdeclast_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesimc_si128(simde__m128i a) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesimc_si128(a); + #else + simde__m128i_private result_ = simde__m128i_to_private(simde_mm_setzero_si128()); + simde__m128i_private a_ = simde__m128i_to_private(a); + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = vaesimcq_u8(a_.neon_u8); + #else + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = a_.u8[Nb*j+i]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(4, col, res); + + for (i = 0; i < 4; i++) { + result_.u8[Nb*j+i] = res[i]; + } + } + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesimc_si128(a) simde_mm_aesimc_si128(a) +#endif + +#undef simde_x_aes_Nb + +#endif /* !defined(SIMDE_X86_AES_H) */ diff --git a/x86/avx.h b/x86/avx.h index a10974c92..f09a52cf5 100644 --- a/x86/avx.h +++ b/x86/avx.h @@ -30,6 +30,7 @@ #define SIMDE_X86_AVX_H #include "sse4.2.h" +#include "../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -90,6 +91,9 @@ typedef union { SIMDE_ALIGN_TO_16 
SIMDE_POWER_ALTIVEC_VECTOR(long long) altivec_i64[2]; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_TO_32 __m256i i256; + SIMDE_ALIGN_TO_32 __m256 f256; #endif } simde__m256_private; @@ -148,6 +152,9 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_TO_32 __m256i i256; + SIMDE_ALIGN_TO_32 __m256d d256; #endif } simde__m256d_private; @@ -165,6 +172,11 @@ typedef union { SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_32 simde_float16 f16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_32 simde_float16 f16[16]; + #endif SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; @@ -184,6 +196,7 @@ typedef union { SIMDE_ALIGN_TO_32 simde_int128 i128[2]; SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; #endif + SIMDE_ALIGN_TO_32 simde_float16 f16[16]; SIMDE_ALIGN_TO_32 simde_float32 f32[8]; SIMDE_ALIGN_TO_32 simde_float64 f64[4]; #endif @@ -206,10 +219,12 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_TO_32 __m256i i256; #endif } simde__m256i_private; -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_LOONGARCH_LASX_NATIVE) typedef __m256 simde__m256; typedef __m256i simde__m256i; typedef __m256d simde__m256d; @@ -374,6 +389,8 @@ simde__m256d simde_mm256_castps_pd (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_castps_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)a; #else return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); #endif @@ -388,6 +405,8 @@ simde__m256i simde_mm256_castps_si256 (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_castps_si256(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256i)a; #else return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); #endif @@ -402,6 +421,8 @@ simde__m256d simde_mm256_castsi256_pd (simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_castsi256_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)a; #else return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); #endif @@ -416,6 +437,8 @@ simde__m256 simde_mm256_castsi256_ps (simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_castsi256_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)a; #else return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); #endif @@ -430,6 +453,8 @@ simde__m256 simde_mm256_castpd_ps (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_castpd_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)a; #else return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); #endif @@ -444,6 +469,8 @@ simde__m256i simde_mm256_castpd_si256 (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_castpd_si256(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256i)a; #else return 
*HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); #endif @@ -458,6 +485,8 @@ simde__m256i simde_mm256_setzero_si256 (void) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_setzero_si256(); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_w(0); #else simde__m256i_private r_; @@ -484,6 +513,8 @@ simde__m256 simde_mm256_setzero_ps (void) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_setzero_ps(); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)__lasx_xvreplgr2vr_w(0); #else return simde_mm256_castsi256_ps(simde_mm256_setzero_si256()); #endif @@ -498,6 +529,8 @@ simde__m256d simde_mm256_setzero_pd (void) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_setzero_pd(); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)__lasx_xvreplgr2vr_d(0); #else return simde_mm256_castsi256_pd(simde_mm256_setzero_si256()); #endif @@ -514,7 +547,9 @@ simde_x_mm256_not_ps(simde__m256 a) { r_, a_ = simde__m256_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvnor_v(a_.i256, a_.i256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = ~a_.i32; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) r_.m128[0] = simde_x_mm_not_ps(a_.m128[0]); @@ -549,7 +584,9 @@ simde_x_mm256_select_ps(simde__m256 a, simde__m256 b, simde__m256 mask) { b_ = simde__m256_to_private(b), mask_ = simde__m256_to_private(mask); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvbitsel_v(a_.i256, b_.i256, mask_.i256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) r_.m128[0] = simde_x_mm_select_ps(a_.m128[0], b_.m128[0], mask_.m128[0]); @@ -572,7 +609,9 @@ simde_x_mm256_not_pd(simde__m256d a) { r_, a_ = simde__m256d_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvnor_v(a_.i256, a_.i256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = ~a_.i64; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) r_.m128d[0] = simde_x_mm_not_pd(a_.m128d[0]); @@ -607,7 +646,9 @@ simde_x_mm256_select_pd(simde__m256d a, simde__m256d b, simde__m256d mask) { b_ = simde__m256d_to_private(b), mask_ = simde__m256d_to_private(mask); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvbitsel_v(a_.i256, b_.i256, mask_.i256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) r_.m128d[0] = simde_x_mm_select_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]); @@ -628,7 +669,9 @@ simde__m256i simde_x_mm256_setone_si256 (void) { simde__m256i_private r_; -#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvreplgr2vr_w(-1); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) __typeof__(r_.i32f) rv = { 0, }; r_.i32f = ~rv; #elif defined(SIMDE_X86_AVX2_NATIVE) @@ -671,6 +714,14 @@ simde_mm256_set_epi8 (int8_t e31, int8_t e30, int8_t e29, int8_t e28, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_LIKE_32(__m256i) int8_t data[32] = { + e0, e1, e2, e3, e4, e5, e6, e7, + e8, e9, e10, e11, e12, e13, e14, e15, + e16, e17, e18, e19, e20, e21, e22, e23, + e24, e25, e26, e27, e28, e29, e30, e31 + }; + return __lasx_xvld(data, 0); #else simde__m256i_private r_; @@ -734,6 
+785,11 @@ simde_mm256_set_epi16 (int16_t e15, int16_t e14, int16_t e13, int16_t e12, #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_LIKE_32(__m256i) int16_t data[16] = { + e0, e1, e2, e3, e4, e5, e6, e7, + e8, e9, e10, e11, e12, e13, e14, e15}; + return __lasx_xvld(data, 0); #else simde__m256i_private r_; @@ -774,6 +830,10 @@ simde_mm256_set_epi32 (int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_LIKE_32(__m256i) int32_t data[8] = { + e0, e1, e2, e3, e4, e5, e6, e7}; + return __lasx_xvld(data, 0); #else simde__m256i_private r_; @@ -805,6 +865,9 @@ simde__m256i simde_mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_epi64x(e3, e2, e1, e0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_LIKE_32(__m256i) int64_t data[4] = {e0, e1, e2, e3}; + return __lasx_xvld(data, 0); #else simde__m256i_private r_; @@ -910,6 +973,9 @@ simde_x_mm256_set_epu32 (uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4, #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4), HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_LIKE_32(__m256i) uint32_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + return __lasx_xvld(data, 0); #else simde__m256i_private r_; @@ -956,6 +1022,9 @@ simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_ #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0); r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + SIMDE_ALIGN_LIKE_32(__m256) simde_float32 data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; + r_.i256 = __lasx_xvld(data, 0); #else r_.f32[0] = e0; r_.f32[1] = e1; @@ -1098,6 +1167,8 @@ simde__m256i simde_mm256_set1_epi8 (int8_t a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set1_epi8(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_b(a); #else simde__m256i_private r_; @@ -1124,6 +1195,8 @@ simde__m256i simde_mm256_set1_epi16 (int16_t a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set1_epi16(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_h(a); #else simde__m256i_private r_; @@ -1150,6 +1223,8 @@ simde__m256i simde_mm256_set1_epi32 (int32_t a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set1_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_w(a); #else simde__m256i_private r_; @@ -1176,6 +1251,8 @@ simde__m256i simde_mm256_set1_epi64x (int64_t a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set1_epi64x(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_d(a); #else simde__m256i_private r_; @@ -1202,6 +1279,8 @@ simde__m256 simde_mm256_set1_ps (simde_float32 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set1_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)__lasx_xvldrepl_w(&a, 0); #else simde__m256_private r_; @@ -1228,6 +1307,8 @@ simde__m256d 
simde_mm256_set1_pd (simde_float64 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set1_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)__lasx_xvldrepl_d(&a, 0); #else simde__m256d_private r_; @@ -1252,109 +1333,125 @@ simde_mm256_set1_pd (simde_float64 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_x_mm256_deinterleaveeven_epi16 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvpickev_h(b, a); #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i16[i] = a_.i16[2 * i]; - r_.i16[i + quarter_point] = b_.i16[2 * i]; - r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i]; - r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i]; - } - #endif + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); - return simde__m256i_from_private(r_); + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[1], b_.m128i[1]); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30); + #else + const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; + const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; + for (size_t i = 0 ; i < quarter_point ; i++) { + r_.i16[i] = a_.i16[2 * i]; + r_.i16[i + quarter_point] = b_.i16[2 * i]; + r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i]; + r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i]; + } + #endif + + return simde__m256i_from_private(r_); + #endif } SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_x_mm256_deinterleaveodd_epi16 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvpickod_h(b, a); #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i16[i] = a_.i16[2 * i + 1]; - r_.i16[i + quarter_point] = b_.i16[2 * i + 1]; - r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i + 1]; - r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i + 1]; - } - #endif + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); - return simde__m256i_from_private(r_); + #if 
SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[1], b_.m128i[1]); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31); + #else + const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; + const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; + for (size_t i = 0 ; i < quarter_point ; i++) { + r_.i16[i] = a_.i16[2 * i + 1]; + r_.i16[i + quarter_point] = b_.i16[2 * i + 1]; + r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i + 1]; + r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i + 1]; + } + #endif + + return simde__m256i_from_private(r_); + #endif } SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_x_mm256_deinterleaveeven_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 2, 8, 10, 4, 6, 12, 14); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvpickev_w(b, a); #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i32[i] = a_.i32[2 * i]; - r_.i32[i + quarter_point] = b_.i32[2 * i]; - r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i]; - r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i]; - } - #endif + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); - return simde__m256i_from_private(r_); + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[1], b_.m128i[1]); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 2, 8, 10, 4, 6, 12, 14); + #else + const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; + const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4; + for (size_t i = 0 ; i < quarter_point ; i++) { + r_.i32[i] = a_.i32[2 * i]; + r_.i32[i + quarter_point] = b_.i32[2 * i]; + r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i]; + r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i]; + } + #endif + + return simde__m256i_from_private(r_); + #endif } SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_x_mm256_deinterleaveodd_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 1, 3, 9, 11, 5, 7, 13, 15); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvpickod_w(b, a); #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - const size_t quarter_point = (sizeof(r_.i32) / 
sizeof(r_.i32[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i32[i] = a_.i32[2 * i + 1]; - r_.i32[i + quarter_point] = b_.i32[2 * i + 1]; - r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i + 1]; - r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i + 1]; - } - #endif + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); - return simde__m256i_from_private(r_); + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[1], b_.m128i[1]); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 1, 3, 9, 11, 5, 7, 13, 15); + #else + const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; + const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4; + for (size_t i = 0 ; i < quarter_point ; i++) { + r_.i32[i] = a_.i32[2 * i + 1]; + r_.i32[i + quarter_point] = b_.i32[2 * i + 1]; + r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i + 1]; + r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i + 1]; + } + #endif + + return simde__m256i_from_private(r_); + #endif } SIMDE_FUNCTION_ATTRIBUTES @@ -1365,7 +1462,9 @@ simde_x_mm256_deinterleaveeven_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvpickev_w(b_.i256, a_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_x_mm_deinterleaveeven_ps(a_.m128[0], b_.m128[0]); r_.m128[1] = simde_x_mm_deinterleaveeven_ps(a_.m128[1], b_.m128[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -1392,7 +1491,9 @@ simde_x_mm256_deinterleaveodd_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvpickod_w(b_.i256, a_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_x_mm_deinterleaveodd_ps(a_.m128[0], b_.m128[0]); r_.m128[1] = simde_x_mm_deinterleaveodd_ps(a_.m128[1], b_.m128[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -1419,7 +1520,9 @@ simde_x_mm256_deinterleaveeven_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvpickev_d(b_.i256, a_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_x_mm_deinterleaveeven_pd(a_.m128d[0], b_.m128d[0]); r_.m128d[1] = simde_x_mm_deinterleaveeven_pd(a_.m128d[1], b_.m128d[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -1446,7 +1549,9 @@ simde_x_mm256_deinterleaveodd_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvpickod_d(b_.i256, a_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_x_mm_deinterleaveodd_pd(a_.m128d[0], b_.m128d[0]); r_.m128d[1] = simde_x_mm_deinterleaveodd_pd(a_.m128d[1], b_.m128d[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -1498,6 +1603,8 @@ simde__m256 simde_mm256_add_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_add_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfadd_s(a, b); #else 
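/* Fallback when neither AVX nor LASX is available: compute through the simde__m256_private representation declared below. */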
simde__m256_private r_, @@ -1543,6 +1650,8 @@ simde__m256d simde_mm256_add_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_add_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfadd_d(a, b); #else simde__m256d_private r_, @@ -1588,6 +1697,9 @@ simde__m256 simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_addsub_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256 add_ = __lasx_xvfadd_s(a, b), sub_ = __lasx_xvfsub_s(a, b); + return (simde__m256)__lasx_xvextrins_w(__lasx_xvextrins_w(sub_, add_, 0x11), add_, 0x33); #else simde__m256_private r_, @@ -1618,6 +1730,9 @@ simde__m256d simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_addsub_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256d add_ = __lasx_xvfadd_d(a, b), sub_ = __lasx_xvfsub_d(a, b); + return (simde__m256d)__lasx_xvextrins_d(__lasx_xvextrins_d(sub_, add_, 0x11), add_, 0x33); #else simde__m256d_private r_, @@ -1654,7 +1769,9 @@ simde_mm256_and_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvand_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_and_ps(a_.m128[0], b_.m128[0]); r_.m128[1] = simde_mm_and_ps(a_.m128[1], b_.m128[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1685,7 +1802,9 @@ simde_mm256_and_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvand_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_mm_and_pd(a_.m128d[0], b_.m128d[0]); r_.m128d[1] = simde_mm_and_pd(a_.m128d[1], b_.m128d[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1716,7 +1835,9 @@ simde_mm256_andnot_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvandn_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_andnot_ps(a_.m128[0], b_.m128[0]); r_.m128[1] = simde_mm_andnot_ps(a_.m128[1], b_.m128[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1747,7 +1868,9 @@ simde_mm256_andnot_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvandn_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_mm_andnot_pd(a_.m128d[0], b_.m128d[0]); r_.m128d[1] = simde_mm_andnot_pd(a_.m128d[1], b_.m128d[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1776,10 +1899,18 @@ simde_mm256_blend_ps (simde__m256 a, simde__m256 b, const int imm8) a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? 
b_.f32[i] : a_.f32[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i + mask = simde_mm256_set_epi32((imm8 >> 7) & 1, (imm8 >> 6) & 1, (imm8 >> 5) & 1, + (imm8 >> 4) & 1, (imm8 >> 3) & 1, (imm8 >> 2) & 1, (imm8 >> 1) & 1, (imm8 & 1)); + mask = __lasx_xvseqi_w(mask, 1); + r_.i256 = __lasx_xvbitsel_v(a_.i256, b_.i256, mask); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; + } + #endif return simde__m256_from_private(r_); } @@ -1805,10 +1936,17 @@ simde_mm256_blend_pd (simde__m256d a, simde__m256d b, const int imm8) a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i + mask = simde_mm256_set_epi64x((imm8 >> 3) & 1, (imm8 >> 2) & 1, (imm8 >> 1) & 1, (imm8 & 1)); + mask = __lasx_xvseqi_d(mask, 1); + r_.i256 = __lasx_xvbitsel_v(a_.i256, b_.i256, mask); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; + } + #endif return simde__m256d_from_private(r_); } #if defined(SIMDE_X86_AVX_NATIVE) @@ -1839,6 +1977,9 @@ simde_mm256_blendv_ps (simde__m256 a, simde__m256 b, simde__m256 mask) { #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_blendv_ps(a_.m128[0], b_.m128[0], mask_.m128[0]); r_.m128[1] = simde_mm_blendv_ps(a_.m128[1], b_.m128[1], mask_.m128[1]); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i m = __lasx_xvslti_w(mask_.i256, 0); + r_.i256 = __lasx_xvbitsel_v(a_.i256, b_.i256, m); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -1869,6 +2010,9 @@ simde_mm256_blendv_pd (simde__m256d a, simde__m256d b, simde__m256d mask) { #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_mm_blendv_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]); r_.m128d[1] = simde_mm_blendv_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i m = __lasx_xvslti_d(mask_.i256, 0); + r_.i256 = __lasx_xvbitsel_v(a_.i256, b_.i256, m); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { @@ -1889,6 +2033,8 @@ simde__m256d simde_mm256_broadcast_pd (simde__m128d const * mem_addr) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_broadcast_pd(mem_addr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)__lasx_xvld(HEDLEY_REINTERPRET_CAST(simde_float64 const*, mem_addr), 0); #else simde__m256d_private r_; @@ -1912,9 +2058,14 @@ simde_mm256_broadcast_ps (simde__m128 const * mem_addr) { #else simde__m256_private r_; - simde__m128 tmp = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(simde_float32 const*, mem_addr)); - r_.m128[0] = tmp; - r_.m128[1] = tmp; + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvld(mem_addr, 0); + r_.i256 = __lasx_xvpermi_q(r_.i256, r_.i256, 0x00); + #else + simde__m128 tmp = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(simde_float32 const*, mem_addr)); + r_.m128[0] = tmp; + r_.m128[1] = tmp; + #endif return simde__m256_from_private(r_); #endif @@ -1929,6 +2080,8 @@ simde__m256d simde_mm256_broadcast_sd (simde_float64 const * a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_broadcast_sd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)__lasx_xvldrepl_d(a, 0); #else return 
simde_mm256_set1_pd(*a); #endif @@ -1943,6 +2096,10 @@ simde__m128 simde_mm_broadcast_ss (simde_float32 const * a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm_broadcast_ss(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m128)__lsx_vldrepl_w(a, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128_from_wasm_v128(wasm_v128_load32_splat(a)); #else return simde_mm_set1_ps(*a); #endif @@ -1957,6 +2114,8 @@ simde__m256 simde_mm256_broadcast_ss (simde_float32 const * a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_broadcast_ss(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)__lasx_xvldrepl_w(a, 0); #else return simde_mm256_set1_ps(*a); #endif @@ -2074,7 +2233,11 @@ simde_mm256_round_ps (simde__m256 a, const int rounding) { simde__m256_private r_, a_ = simde__m256_to_private(a); - + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_STATEMENT_EXPR_) + for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_mm_round_ps, r_.m128[i], (HEDLEY_UNREACHABLE(), simde_mm_undefined_ps()), rounding, a_.m128[i]); + } + #else switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { #if defined(simde_math_nearbyintf) case SIMDE_MM_FROUND_CUR_DIRECTION: @@ -2084,42 +2247,50 @@ simde_mm256_round_ps (simde__m256 a, const int rounding) { break; #endif - #if defined(simde_math_roundf) case SIMDE_MM_FROUND_TO_NEAREST_INT: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.f256 = __lasx_xvfrintrne_s(a); + #elif defined(simde_math_roundf) for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_roundf(a_.f32[i]); } + #endif break; - #endif - #if defined(simde_math_floorf) case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.f256 = __lasx_xvfrintrm_s(a); + #elif defined(simde_math_floorf) for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_floorf(a_.f32[i]); } + #endif break; - #endif - #if defined(simde_math_ceilf) case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.f256 = __lasx_xvfrintrp_s(a); + #elif defined(simde_math_ceilf) for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_ceilf(a_.f32[i]); } + #endif break; - #endif - #if defined(simde_math_truncf) case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.f256 = __lasx_xvfrintrz_s(a); + #elif defined(simde_math_truncf) for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_truncf(a_.f32[i]); } + #endif break; - #endif default: HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_ps()); } - + #endif return simde__m256_from_private(r_); } #if defined(SIMDE_X86_AVX_NATIVE) @@ -2127,7 +2298,7 @@ simde_mm256_round_ps (simde__m256 a, const int rounding) { #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_round_ps_r_, \ + simde_mm256_round_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_round_ps_a_ = simde__m256_to_private(a); \ \ for (size_t simde_mm256_round_ps_i = 0 ; simde_mm256_round_ps_i < (sizeof(simde_mm256_round_ps_r_.m128) / sizeof(simde_mm256_round_ps_r_.m128[0])) ; simde_mm256_round_ps_i++) { \ @@ -2148,6 +2319,11 @@ simde_mm256_round_pd (simde__m256d a, const int rounding) { simde__m256d_private r_, a_ = simde__m256d_to_private(a); + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) && 
!defined(SIMDE_STATEMENT_EXPR_) + for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_mm_round_pd, r_.m128d[i], (HEDLEY_UNREACHABLE(), simde_mm_undefined_pd()), rounding, a_.m128d[i]); + } + #else switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { #if defined(simde_math_nearbyint) @@ -2158,42 +2334,50 @@ simde_mm256_round_pd (simde__m256d a, const int rounding) { break; #endif - #if defined(simde_math_round) case SIMDE_MM_FROUND_TO_NEAREST_INT: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.d256 = __lasx_xvfrintrne_d(a); + #elif defined(simde_math_round) for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { r_.f64[i] = simde_math_round(a_.f64[i]); } + #endif break; - #endif - #if defined(simde_math_floor) case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.d256 = __lasx_xvfrintrm_d(a); + #elif defined(simde_math_floor) for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { r_.f64[i] = simde_math_floor(a_.f64[i]); } + #endif break; - #endif - #if defined(simde_math_ceil) case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.d256 = __lasx_xvfrintrp_d(a); + #elif defined(simde_math_ceil) for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { r_.f64[i] = simde_math_ceil(a_.f64[i]); } + #endif break; - #endif - #if defined(simde_math_trunc) case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.d256 = __lasx_xvfrintrz_d(a); + #elif defined(simde_math_trunc) for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { r_.f64[i] = simde_math_trunc(a_.f64[i]); } + #endif break; - #endif default: HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_pd()); } - + #endif return simde__m256d_from_private(r_); } #if defined(SIMDE_X86_AVX_NATIVE) @@ -2201,7 +2385,7 @@ simde_mm256_round_pd (simde__m256d a, const int rounding) { #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m256d_private \ - simde_mm256_round_pd_r_, \ + simde_mm256_round_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ simde_mm256_round_pd_a_ = simde__m256d_to_private(a); \ \ for (size_t simde_mm256_round_pd_i = 0 ; simde_mm256_round_pd_i < (sizeof(simde_mm256_round_pd_r_.m128d) / sizeof(simde_mm256_round_pd_r_.m128d[0])) ; simde_mm256_round_pd_i++) { \ @@ -2447,66 +2631,119 @@ simde_mm_cmp_sd (simde__m128d a, simde__m128d b, const int imm8) simde__m128d_private a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m128i t_; + #endif switch (imm8) { case SIMDE_CMP_EQ_OQ: case SIMDE_CMP_EQ_OS: - a_.i64[0] = (a_.f64[0] == b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_seq_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = (a_.f64[0] == b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_LT_OQ: case SIMDE_CMP_LT_OS: - a_.i64[0] = (a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_slt_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = (a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_LE_OQ: case SIMDE_CMP_LE_OS: - a_.i64[0] = (a_.f64[0] <= b_.f64[0]) ? 
~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_sle_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = (a_.f64[0] <= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_UNORD_Q: case SIMDE_CMP_UNORD_S: - a_.i64[0] = ((a_.f64[0] != a_.f64[0]) || (b_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cun_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = ((a_.f64[0] != a_.f64[0]) || (b_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_NEQ_UQ: case SIMDE_CMP_NEQ_US: - a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_NEQ_OQ: case SIMDE_CMP_NEQ_OS: - a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cne_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_NLT_UQ: case SIMDE_CMP_NLT_US: - a_.i64[0] = !(a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lsx_vfcmp_cult_d(a_.lsx_f64, b_.lsx_f64); + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + #else + a_.i64[0] = !(a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_NLE_UQ: case SIMDE_CMP_NLE_US: - a_.i64[0] = !(a_.f64[0] <= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lsx_vfcmp_cule_d(a_.lsx_f64, b_.lsx_f64); + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + #else + a_.i64[0] = !(a_.f64[0] <= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_ORD_Q: case SIMDE_CMP_ORD_S: - a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cor_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_EQ_UQ: case SIMDE_CMP_EQ_US: - a_.i64[0] = ((a_.f64[0] != a_.f64[0]) | (b_.f64[0] != b_.f64[0]) | (a_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cueq_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = ((a_.f64[0] != a_.f64[0]) | (b_.f64[0] != b_.f64[0]) | (a_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_NGE_UQ: case SIMDE_CMP_NGE_US: - a_.i64[0] = !(a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cult_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = !(a_.f64[0] >= b_.f64[0]) ? 
~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_NGT_UQ: case SIMDE_CMP_NGT_US: - a_.i64[0] = !(a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_cule_d(a_.lsx_f64, b_.lsx_f64), 0x00); + #else + a_.i64[0] = !(a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_FALSE_OQ: @@ -2516,12 +2753,21 @@ simde_mm_cmp_sd (simde__m128d a, simde__m128d b, const int imm8) case SIMDE_CMP_GE_OQ: case SIMDE_CMP_GE_OS: - a_.i64[0] = (a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64); + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + #else + a_.i64[0] = (a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: - a_.i64[0] = (a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0x00); + #else + a_.i64[0] = (a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); + #endif break; case SIMDE_CMP_TRUE_UQ: @@ -2550,86 +2796,156 @@ simde_mm_cmp_ss (simde__m128 a, simde__m128 b, const int imm8) simde__m128_private a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m128i t_; + #endif switch (imm8) { case SIMDE_CMP_EQ_OQ: case SIMDE_CMP_EQ_OS: - a_.i32[0] = (a_.f32[0] == b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_seq_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = (a_.f32[0] == b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_LT_OQ: case SIMDE_CMP_LT_OS: - a_.i32[0] = (a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_slt_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = (a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_LE_OQ: case SIMDE_CMP_LE_OS: - a_.i32[0] = (a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_sle_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = (a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_UNORD_Q: case SIMDE_CMP_UNORD_S: - a_.i32[0] = ((a_.f32[0] != a_.f32[0]) || (b_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = ((a_.f32[0] != a_.f32[0]) || (b_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_NEQ_UQ: case SIMDE_CMP_NEQ_US: - a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_NEQ_OQ: case SIMDE_CMP_NEQ_OS: - a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? 
~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cne_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_NLT_UQ: case SIMDE_CMP_NLT_US: - a_.i32[0] = !(a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lsx_vfcmp_cult_s(a_.lsx_f32, b_.lsx_f32); + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + #else + a_.i32[0] = !(a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_NLE_UQ: case SIMDE_CMP_NLE_US: - a_.i32[0] = !(a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lsx_vfcmp_cule_s(a_.lsx_f32, b_.lsx_f32); + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + #else + a_.i32[0] = !(a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_ORD_Q: case SIMDE_CMP_ORD_S: - a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cor_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_EQ_UQ: case SIMDE_CMP_EQ_US: - a_.i32[0] = ((a_.f32[0] != a_.f32[0]) | (b_.f32[0] != b_.f32[0]) | (a_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cueq_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = ((a_.f32[0] != a_.f32[0]) | (b_.f32[0] != b_.f32[0]) | (a_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_NGE_UQ: case SIMDE_CMP_NGE_US: - a_.i32[0] = !(a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cult_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = !(a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_NGT_UQ: case SIMDE_CMP_NGT_US: - a_.i32[0] = !(a_.f32[0] > b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_cule_s(a_.lsx_f32, b_.lsx_f32), 0x00); + #else + a_.i32[0] = !(a_.f32[0] > b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_FALSE_OQ: case SIMDE_CMP_FALSE_OS: - a_.i32[0] = INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i32[0] = INT32_C(0); + #else + a_.i32[0] = INT32_C(0); + #endif break; case SIMDE_CMP_GE_OQ: case SIMDE_CMP_GE_OS: - a_.i32[0] = (a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32); + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + #else + a_.i32[0] = (a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: - a_.i32[0] = (a_.f32[0] > b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0x00); + #else + a_.i32[0] = (a_.f32[0] > b_.f32[0]) ? 
~INT32_C(0) : INT32_C(0); + #endif break; case SIMDE_CMP_TRUE_UQ: case SIMDE_CMP_TRUE_US: - a_.i32[0] = ~INT32_C(0); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i32[0] = ~INT32_C(0); + #else + a_.i32[0] = ~INT32_C(0); + #endif break; default: @@ -2659,148 +2975,177 @@ simde_mm256_cmp_pd r_, a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i t_; + #endif switch (imm8) { case SIMDE_CMP_EQ_OQ: case SIMDE_CMP_EQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_seq_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] == b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = (a_.f64[i] == b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_LT_OQ: case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_slt_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] < b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = (a_.f64[i] < b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_LE_OQ: case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_sle_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] <= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = (a_.f64[i] <= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_UNORD_Q: case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cun_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] != a_.f64[i]) || (b_.f64[i] != b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = ((a_.f64[i] != a_.f64[i]) || (b_.f64[i] != b_.f64[i])) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_NEQ_UQ: case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cune_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] != b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = (a_.f64[i] != b_.f64[i]) ? 
~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_NEQ_OQ: case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cne_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == a_.f64) & (b_.f64 == b_.f64) & (a_.f64 != b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i]) & (a_.f64[i] != b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i]) & (a_.f64[i] != b_.f64[i])) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_NLT_UQ: case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_cult_d(a_.d256, b_.d256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 < b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] < b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = !(a_.f64[i] < b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_NLE_UQ: case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_cule_d(a_.d256, b_.d256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 <= b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] <= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = !(a_.f64[i] <= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_ORD_Q: case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cor_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ((a_.f64 == a_.f64) & (b_.f64 == b_.f64))); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i])) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_EQ_UQ: case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cueq_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64) | (a_.f64 == b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] != a_.f64[i]) | (b_.f64[i] != b_.f64[i]) | (a_.f64[i] == b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = ((a_.f64[i] != a_.f64[i]) | (b_.f64[i] != b_.f64[i]) | (a_.f64[i] == b_.f64[i])) ? 
~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_NGE_UQ: case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cult_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 >= b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] >= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = !(a_.f64[i] >= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_NGT_UQ: case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cule_d(a_.d256, b_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 > b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] > b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = !(a_.f64[i] > b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; @@ -2812,24 +3157,29 @@ simde_mm256_cmp_pd case SIMDE_CMP_GE_OQ: case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_clt_d(a_.d256, b_.d256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] >= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = (a_.f64[i] >= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); } #endif break; case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_clt_d(b_.d256, a_.d256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] > b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); + r_.i64[i] = (a_.f64[i] > b_.f64[i]) ? 
~INT64_C(0) : INT64_C(0); } #endif break; @@ -2884,11 +3234,21 @@ simde_mm256_cmp_ps r_, a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i t_; + #endif + #if defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) + for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { + SIMDE_CONSTIFY_32_(simde_mm_cmp_ps, r_.m128[i], (HEDLEY_UNREACHABLE(), simde_mm_undefined_ps()), imm8, a_.m128[i], b_.m128[i]); + } + #else switch (imm8) { case SIMDE_CMP_EQ_OQ: case SIMDE_CMP_EQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_seq_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32)); #else SIMDE_VECTORIZE @@ -2900,7 +3260,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_LT_OQ: case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_slt_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); #else SIMDE_VECTORIZE @@ -2912,7 +3274,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_LE_OQ: case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_sle_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); #else SIMDE_VECTORIZE @@ -2924,7 +3288,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_UNORD_Q: case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cun_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32)); #else SIMDE_VECTORIZE @@ -2936,7 +3302,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_NEQ_UQ: case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cune_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); #else SIMDE_VECTORIZE @@ -2948,7 +3316,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_NEQ_OQ: case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cne_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == a_.f32) & (b_.f32 == b_.f32) & (a_.f32 != b_.f32)); #else SIMDE_VECTORIZE @@ -2960,7 +3330,10 @@ simde_mm256_cmp_ps case SIMDE_CMP_NLT_UQ: case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_clt_s(a_.f256, b_.f256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 < b_.f32)); #else SIMDE_VECTORIZE @@ -2972,7 +3345,10 @@ simde_mm256_cmp_ps case SIMDE_CMP_NLE_UQ: case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_cle_s(a_.f256, b_.f256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 <= b_.f32)); #else SIMDE_VECTORIZE @@ -2984,7 +3360,9 @@ 
simde_mm256_cmp_ps case SIMDE_CMP_ORD_Q: case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cor_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ((a_.f32 == a_.f32) & (b_.f32 == b_.f32))); #else SIMDE_VECTORIZE @@ -2996,7 +3374,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_EQ_UQ: case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cueq_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32) | (a_.f32 == b_.f32)); #else SIMDE_VECTORIZE @@ -3008,7 +3388,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_NGE_UQ: case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cult_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 >= b_.f32)); #else SIMDE_VECTORIZE @@ -3020,7 +3402,9 @@ simde_mm256_cmp_ps case SIMDE_CMP_NGT_UQ: case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvfcmp_cule_s(a_.f256, b_.f256); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 > b_.f32)); #else SIMDE_VECTORIZE @@ -3037,7 +3421,10 @@ simde_mm256_cmp_ps case SIMDE_CMP_GE_OQ: case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_cult_s(a_.f256, b_.f256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); #else SIMDE_VECTORIZE @@ -3049,7 +3436,10 @@ simde_mm256_cmp_ps case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + t_ = __lasx_xvfcmp_cule_s(a_.f256, b_.f256); + r_.i256 = __lasx_xvnor_v(t_, t_); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); #else SIMDE_VECTORIZE @@ -3067,7 +3457,7 @@ simde_mm256_cmp_ps default: HEDLEY_UNREACHABLE(); } - + #endif return simde__m256_from_private(r_); } #if defined(__clang__) && defined(__AVX512DQ__) @@ -3089,11 +3479,11 @@ simde_mm256_cmp_ps simde_mm256_cmp_ps_r; \ })) #elif defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_cmp_ps(a, b, imm8) _mm256_cmp_ps(a, b, imm8) + #define simde_mm256_cmp_ps(a, b, imm8) _mm256_cmp_ps((a), (b), (imm8)) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm256_cmp_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_cmp_ps_r_, \ + simde_mm256_cmp_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_cmp_ps_a_ = simde__m256_to_private((a)), \ simde_mm256_cmp_ps_b_ = simde__m256_to_private((b)); \ \ @@ -3158,6 +3548,13 @@ simde__m256d simde_mm256_cvtepi32_pd (simde__m128i a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtepi32_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256d_private a_; + a_.m128d_private[0].lsx_i64 = a; + a_.i256 = __lasx_xvpermi_q(a_.i256, a_.i256, 0x00); + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + a_.d256 = __lasx_xvffintl_d_w(a_.i256); + return simde__m256d_from_private(a_); #else simde__m256d_private r_; 
simde__m128i_private a_ = simde__m128i_to_private(a); @@ -3180,6 +3577,8 @@ simde__m256 simde_mm256_cvtepi32_ps (simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtepi32_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvffint_s_w(a); #else simde__m256_private r_; simde__m256i_private a_ = simde__m256i_to_private(a); @@ -3202,6 +3601,11 @@ simde__m128i simde_mm256_cvtpd_epi32 (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtpd_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256d_private a_; + a_.i256 = __lasx_xvftintrne_w_d(a, a); + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + return a_.m128d_private[0].lsx_i64; #else simde__m128i_private r_; simde__m256d_private a_ = simde__m256d_to_private(a); @@ -3228,6 +3632,11 @@ simde__m128 simde_mm256_cvtpd_ps (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtpd_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256_private a_; + a_.f256 = __lasx_xvfcvt_s_d(a, a); + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + return a_.m128[0]; #else simde__m128_private r_; simde__m256d_private a_ = simde__m256d_to_private(a); @@ -3250,6 +3659,8 @@ simde__m256i simde_mm256_cvtps_epi32 (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtps_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvftintrne_w_s(a); #else simde__m256i_private r_; simde__m256_private a_ = simde__m256_to_private(a); @@ -3276,6 +3687,10 @@ simde__m256d simde_mm256_cvtps_pd (simde__m128 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtps_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256_private a_; a_.m128[0] = a; + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + return __lasx_xvfcvtl_d_s(a_.f256); #else simde__m256d_private r_; simde__m128_private a_ = simde__m128_to_private(a); @@ -3320,6 +3735,8 @@ simde_mm256_cvtsi256_si32 (simde__m256i a) { HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_MSVC_VERSION_CHECK(19,14,0)) return _mm256_cvtsi256_si32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvpickve2gr_w(a, 0); #else simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i32[0]; @@ -3355,6 +3772,11 @@ simde__m128i simde_mm256_cvttpd_epi32 (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvttpd_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i_private a_; + a_.i256 = __lasx_xvftintrz_w_d(a, a); + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + return a_.m128i[0]; #else simde__m128i_private r_; simde__m256d_private a_ = simde__m256d_to_private(a); @@ -3381,6 +3803,8 @@ simde__m256i simde_mm256_cvttps_epi32 (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvttps_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvftintrz_w_s(a); #else simde__m256i_private r_; simde__m256_private a_ = simde__m256_to_private(a); @@ -3407,6 +3831,8 @@ simde__m256 simde_mm256_div_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_div_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfdiv_s(a, b); #else simde__m256_private r_, @@ -3438,6 +3864,8 @@ simde__m256d simde_mm256_div_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_div_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfdiv_d(a, b); #else simde__m256d_private r_, @@ -3539,7 +3967,8 @@ simde_mm256_insert_epi8 (simde__m256i a, int8_t i, const int index) return 
simde__m256i_from_private(a_); } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_insert_epi8(a, i, index) _mm256_insert_epi8(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -3557,7 +3986,8 @@ simde_mm256_insert_epi16 (simde__m256i a, int16_t i, const int index) return simde__m256i_from_private(a_); } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_insert_epi16(a, i, index) _mm256_insert_epi16(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -3575,8 +4005,11 @@ simde_mm256_insert_epi32 (simde__m256i a, int32_t i, const int index) return simde__m256i_from_private(a_); } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_insert_epi32(a, i, index) _mm256_insert_epi32(a, i, index) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_insert_epi32(a, i, index) __lasx_xvinsgr2vr_w(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insert_epi32 @@ -3597,6 +4030,8 @@ simde_mm256_insert_epi64 (simde__m256i a, int64_t i, const int index) (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) && \ SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) #define simde_mm256_insert_epi64(a, i, index) _mm256_insert_epi64(a, i, index) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_insert_epi64(a, i, index) __lasx_xvinsgr2vr_d(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) #undef _mm256_insert_epi64 @@ -3613,6 +4048,9 @@ simde__m256d simde_mm256_insertf128_pd(simde__m256d a, simde__m128d b, int imm8) return simde__m256d_from_private(a_); } +#if defined(SIMDE_X86_AVX_NATIVE) + #define simde_mm256_insertf128_pd(a, b, imm8) _mm256_insertf128_pd(a, b, imm8) +#endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insertf128_pd #define _mm256_insertf128_pd(a, b, imm8) simde_mm256_insertf128_pd(a, b, imm8) @@ -3628,6 +4066,9 @@ simde__m256 simde_mm256_insertf128_ps(simde__m256 a, simde__m128 b, int imm8) return simde__m256_from_private(a_); } +#if defined(SIMDE_X86_AVX_NATIVE) + #define simde_mm256_insertf128_ps(a, b, imm8) _mm256_insertf128_ps(a, b, imm8) +#endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insertf128_ps #define _mm256_insertf128_ps(a, b, imm8) simde_mm256_insertf128_ps(a, b, imm8) @@ -3643,6 +4084,9 @@ simde__m256i simde_mm256_insertf128_si256(simde__m256i a, simde__m128i b, int im return simde__m256i_from_private(a_); } +#if defined(SIMDE_X86_AVX_NATIVE) + #define simde_mm256_insertf128_si256(a, b, imm8) _mm256_insertf128_si256(a, b, imm8) +#endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insertf128_si256 #define _mm256_insertf128_si256(a, b, imm8) simde_mm256_insertf128_si256(a, b, imm8) @@ -3668,8 +4112,11 @@ simde_mm256_extract_epi32 (simde__m256i a, const int index) simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i32[index]; } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_extract_epi32(a, index) _mm256_extract_epi32(a, index) +#elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_extract_epi32(a, index) __lasx_xvpickve2gr_w(a, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_extract_epi32 @@ -3687,6 +4134,8 @@ simde_mm256_extract_epi64 (simde__m256i a, const int index) #if !defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0) #define simde_mm256_extract_epi64(a, index) _mm256_extract_epi64(a, index) #endif +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_extract_epi64(a, index) __lasx_xvpickve2gr_d(a, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) #undef _mm256_extract_epi64 @@ -3698,6 +4147,8 @@ simde__m256i simde_mm256_lddqu_si256 (simde__m256i const * mem_addr) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(mem_addr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); @@ -3714,6 +4165,8 @@ simde__m256d simde_mm256_load_pd (const double mem_addr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_load_pd(mem_addr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)__lasx_xvld(mem_addr, 0); #else simde__m256d r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), sizeof(r)); @@ -3730,6 +4183,8 @@ simde__m256 simde_mm256_load_ps (const float mem_addr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_load_ps(mem_addr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)__lasx_xvld(mem_addr, 0); #else simde__m256 r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), sizeof(r)); @@ -3746,6 +4201,8 @@ simde__m256i simde_mm256_load_si256 (simde__m256i const * mem_addr) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_load_si256(mem_addr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); @@ -3762,6 +4219,14 @@ simde__m256d simde_mm256_loadu_pd (const double a[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256d)__lasx_xvld(a, 0); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + simde__m256d_private r_; + for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { + r_.m128d[i] = simde_mm_loadu_pd(a + 2*i); + } + return simde__m256d_from_private(r_); #else simde__m256d r; simde_memcpy(&r, a, sizeof(r)); @@ -3778,6 +4243,8 @@ simde__m256 simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return (simde__m256)__lasx_xvld(a, 0); #else simde__m256 r; simde_memcpy(&r, a, sizeof(r)); @@ -3789,76 +4256,100 @@ simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) { #define _mm256_loadu_ps(a) simde_mm256_loadu_ps(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi8(mem_addr) _mm256_loadu_epi8(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && 
defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi8(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, mem_addr, sizeof(r)); return r; #endif } +#endif #define simde_x_mm256_loadu_epi8(mem_addr) simde_mm256_loadu_epi8(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi8 #define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi16(mem_addr) _mm256_loadu_epi16(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi16(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, mem_addr, sizeof(r)); return r; #endif } +#endif #define simde_x_mm256_loadu_epi16(mem_addr) simde_mm256_loadu_epi16(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi16 #define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi32(mem_addr) _mm256_loadu_epi32(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi32(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi32(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, mem_addr, sizeof(r)); return r; #endif } +#endif #define simde_x_mm256_loadu_epi32(mem_addr) simde_mm256_loadu_epi32(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi32 #define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi64(mem_addr) 
_mm256_loadu_epi64(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi64(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, mem_addr, sizeof(r)); return r; #endif } +#endif #define simde_x_mm256_loadu_epi64(mem_addr) simde_mm256_loadu_epi64(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi64 @@ -3870,6 +4361,8 @@ simde__m256i simde_mm256_loadu_si256 (void const * mem_addr) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(const __m256i*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); #else simde__m256i r; simde_memcpy(&r, mem_addr, sizeof(r)); @@ -3886,6 +4379,11 @@ simde__m256 simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float loaddr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) return _mm256_loadu2_m128(hiaddr, loaddr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256_private r_; + r_.m128_private[1].lsx_i64 = __lsx_vld(hiaddr, 0); + r_.m128_private[0].lsx_i64 = __lsx_vld(loaddr, 0); + return r_.f256; #else return simde_mm256_insertf128_ps(simde_mm256_castps128_ps256(simde_mm_loadu_ps(loaddr)), @@ -3902,6 +4400,11 @@ simde__m256d simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const double loaddr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) return _mm256_loadu2_m128d(hiaddr, loaddr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256d_private r_; + r_.m128d_private[1].lsx_i64 = __lsx_vld(hiaddr, 0); + r_.m128d_private[0].lsx_i64 = __lsx_vld(loaddr, 0); + return r_.d256; #else return simde_mm256_insertf128_pd(simde_mm256_castpd128_pd256(simde_mm_loadu_pd(loaddr)), @@ -3918,6 +4421,11 @@ simde__m256i simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr) { #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) return _mm256_loadu2_m128i(hiaddr, loaddr); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i_private r_; + r_.m128i[1] = __lsx_vld(hiaddr, 0); + r_.m128i[0] = __lsx_vld(loaddr, 0); + return r_.i256; #else return simde_mm256_insertf128_si256(simde_mm256_castsi128_si256(simde_mm_loadu_si128(loaddr)), @@ -3931,7 +4439,7 @@ simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr SIMDE_FUNCTION_ATTRIBUTES simde__m128d -simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { +simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { #if defined(SIMDE_X86_AVX_NATIVE) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) return _mm_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask)); @@ -3939,19 +4447,28 @@ 
simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde return _mm_maskload_pd(mem_addr, mask); #endif #else - simde__m128d_private - mem_ = simde__m128d_to_private(simde_mm_loadu_pd(mem_addr)), - r_; - simde__m128i_private mask_ = simde__m128i_to_private(mask); + simde__m128d_private r_; + simde__m128i_private + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63)); + mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde_mm_and_pd(simde_mm_load_pd(mem_addr), + simde__m128d_from_wasm_v128(wasm_i64x2_shr(mask_.wasm_v128, 63))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_shr_.lsx_i64 = __lsx_vsrli_d(mask_.lsx_i64, 63); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63); + for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { + mask_shr_.i64[i] = mask_.i64[i] >> 63; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = mask_shr_.i64[i] ? mem_addr[i] : SIMDE_FLOAT64_C(0.0); + } return simde__m128d_from_private(r_); #endif @@ -3974,10 +4491,9 @@ simde_mm256_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], si simde__m256d_private r_; simde__m256i_private mask_ = simde__m256i_to_private(mask); - r_ = simde__m256d_to_private(simde_mm256_loadu_pd(mem_addr)); SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] &= mask_.i64[i] >> 63; + r_.f64[i] = (mask_.i64[i] >> 63) ? mem_addr[i] : SIMDE_FLOAT64_C(0.0); } return simde__m256d_from_private(r_); @@ -3998,20 +4514,30 @@ simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde return _mm_maskload_ps(mem_addr, mask); #endif #else - simde__m128_private - mem_ = simde__m128_to_private(simde_mm_loadu_ps(mem_addr)), - r_; - simde__m128i_private mask_ = simde__m128i_to_private(mask); + simde__m128_private r_; + simde__m128i_private + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31)); + mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde_mm_and_ps(simde_mm_load_ps(mem_addr), + simde__m128_from_wasm_v128(wasm_i32x4_shr(mask_.wasm_v128, 31))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_shr_.lsx_i64 = __lsx_vsrli_w(mask_.lsx_i64, 31); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31); + for (size_t i = 0 ; i < (sizeof(mask_.i32) / sizeof(mask_.i32[0])) ; i++) { + mask_shr_.i32[i] = mask_.i32[i] >> 31; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = mask_shr_.i32[i] ? 
mem_addr[i] : SIMDE_FLOAT32_C(0.0); + } + return simde__m128_from_private(r_); #endif } @@ -4022,7 +4548,7 @@ simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde SIMDE_FUNCTION_ATTRIBUTES simde__m256 -simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { +simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) { #if defined(SIMDE_X86_AVX_NATIVE) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) return _mm256_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask)); @@ -4033,10 +4559,9 @@ simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], si simde__m256_private r_; simde__m256i_private mask_ = simde__m256i_to_private(mask); - r_ = simde__m256_to_private(simde_mm256_loadu_ps(mem_addr)); SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] &= mask_.i32[i] >> 31; + r_.f32[i] = (mask_.i32[i] >> 31) ? mem_addr[i] : SIMDE_FLOAT32_C(0.0); } return simde__m256_from_private(r_); @@ -4056,15 +4581,27 @@ simde_mm_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m12 #else _mm_maskstore_pd(mem_addr, mask, a); #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + if (__lsx_vpickve2gr_d(mask, 0) < 0) + __lsx_vstelm_d(HEDLEY_REINTERPRET_CAST(simde__m128i, a), mem_addr, 0, 0); + if (__lsx_vpickve2gr_d(mask, 1) < 0) + __lsx_vstelm_d(HEDLEY_REINTERPRET_CAST(simde__m128i, a), mem_addr, 8, 1); #else simde__m128i_private mask_ = simde__m128i_to_private(mask); simde__m128d_private a_ = simde__m128d_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.f64[i]; - } + #if defined(SIMDE_WASM_SIMD128_NATIVE) + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 0)) & 0x8000000000000000ull) != 0) + mem_addr[0] = wasm_f64x2_extract_lane(a_.wasm_v128, 0); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 1)) & 0x8000000000000000ull) != 0) + mem_addr[1] = wasm_f64x2_extract_lane(a_.wasm_v128, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + if (mask_.u64[i] >> 63) + mem_addr[i] = a_.f64[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -4081,6 +4618,10 @@ simde_mm256_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__ #else _mm256_maskstore_pd(mem_addr, mask, a); #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i r_ = __lasx_xvld(mem_addr, 0); mask = __lasx_xvslti_d(mask, 0); + r_ = __lasx_xvbitsel_v(r_, HEDLEY_REINTERPRET_CAST(simde__m256i, a), mask); + __lasx_xvst(r_, mem_addr, 0); #else simde__m256i_private mask_ = simde__m256i_to_private(mask); simde__m256d_private a_ = simde__m256d_to_private(a); @@ -4106,15 +4647,30 @@ simde_mm_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m12 #else _mm_maskstore_ps(mem_addr, mask, a); #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m128i r_ = __lsx_vld(mem_addr, 0); mask = __lsx_vslti_w(mask, 0); + r_ = __lsx_vbitsel_v(r_, HEDLEY_REINTERPRET_CAST(simde__m128i, a), mask); + __lsx_vst(r_, mem_addr, 0); #else simde__m128i_private mask_ = simde__m128i_to_private(mask); simde__m128_private a_ = simde__m128_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - 
mem_addr[i] = a_.f32[i]; - } + #if defined(SIMDE_WASM_SIMD128_NATIVE) + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 0)) & 0x80000000ull) != 0) + mem_addr[0] = wasm_f32x4_extract_lane(a_.wasm_v128, 0); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 1)) & 0x80000000ull) != 0) + mem_addr[1] = wasm_f32x4_extract_lane(a_.wasm_v128, 1); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 2)) & 0x80000000ull) != 0) + mem_addr[2] = wasm_f32x4_extract_lane(a_.wasm_v128, 2); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 3)) & 0x80000000ull) != 0) + mem_addr[3] = wasm_f32x4_extract_lane(a_.wasm_v128, 3); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + if (mask_.u32[i] & (UINT32_C(1) << 31)) + mem_addr[i] = a_.f32[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -4131,6 +4687,10 @@ simde_mm256_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__ #else _mm256_maskstore_ps(mem_addr, mask, a); #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i r_ = __lasx_xvld(mem_addr, 0); mask = __lasx_xvslti_w(mask, 0); + r_ = __lasx_xvbitsel_v(r_, HEDLEY_REINTERPRET_CAST(simde__m256i, a), mask); + __lasx_xvst(r_, mem_addr, 0); #else simde__m256i_private mask_ = simde__m256i_to_private(mask); simde__m256_private a_ = simde__m256_to_private(a); @@ -4152,6 +4712,8 @@ simde__m256 simde_mm256_min_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_min_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmin_s(a, b); #else simde__m256_private r_, @@ -4181,6 +4743,8 @@ simde__m256d simde_mm256_min_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_min_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmin_d(a, b); #else simde__m256d_private r_, @@ -4210,6 +4774,8 @@ simde__m256 simde_mm256_max_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_max_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmax_s(a, b); #else simde__m256_private r_, @@ -4239,6 +4805,8 @@ simde__m256d simde_mm256_max_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_max_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmax_d(a, b); #else simde__m256d_private r_, @@ -4268,6 +4836,9 @@ simde__m256d simde_mm256_movedup_pd (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_movedup_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i r_ = __lasx_xvrepl128vei_d(HEDLEY_REINTERPRET_CAST(simde__m256i, a), 0); + return HEDLEY_REINTERPRET_CAST(simde__m256d, r_); #else simde__m256d_private r_, @@ -4295,6 +4866,9 @@ simde__m256 simde_mm256_movehdup_ps (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_movehdup_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i r_ = __lasx_xvshuf4i_h(HEDLEY_REINTERPRET_CAST(simde__m256i, a), 0b11101110); + return (HEDLEY_REINTERPRET_CAST(simde__m256, r_)); #else simde__m256_private r_, @@ -4322,6 +4896,9 @@ simde__m256 simde_mm256_moveldup_ps (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_moveldup_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256i r_ = __lasx_xvshuf4i_h(HEDLEY_REINTERPRET_CAST(simde__m256i, a), 0b01000100); + return 
(HEDLEY_REINTERPRET_CAST(simde__m256, r_)); #else simde__m256_private r_, @@ -4353,10 +4930,15 @@ simde_mm256_movemask_ps (simde__m256 a) { simde__m256_private a_ = simde__m256_to_private(a); int r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r |= (a_.u32[i] >> 31) << i; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvmskltz_w(a_.i256); + r = (__lasx_xvpickve2gr_w(a_.i256, 0) | (__lasx_xvpickve2gr_w(a_.i256, 4) << 4)); + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r |= (a_.u32[i] >> 31) << i; + } + #endif return r; #endif @@ -4375,10 +4957,15 @@ simde_mm256_movemask_pd (simde__m256d a) { simde__m256d_private a_ = simde__m256d_to_private(a); int r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r |= (a_.u64[i] >> 63) << i; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvmskltz_d(a_.i256); + r = (__lasx_xvpickve2gr_w(a_.i256, 0) | (__lasx_xvpickve2gr_w(a_.i256, 4) << 2)); + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r |= (a_.u64[i] >> 63) << i; + } + #endif return r; #endif @@ -4393,6 +4980,8 @@ simde__m256 simde_mm256_mul_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_mul_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmul_s(a, b); #else simde__m256_private r_, @@ -4424,6 +5013,8 @@ simde__m256d simde_mm256_mul_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_mul_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmul_d(a, b); #else simde__m256d_private r_, @@ -4461,7 +5052,9 @@ simde_mm256_or_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvor_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_or_ps(a_.m128[0], b_.m128[0]); r_.m128[1] = simde_mm_or_ps(a_.m128[1], b_.m128[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -4492,7 +5085,9 @@ simde_mm256_or_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvor_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_mm_or_pd(a_.m128d[0], b_.m128d[0]); r_.m128d[1] = simde_mm_or_pd(a_.m128d[1], b_.m128d[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -4529,6 +5124,9 @@ simde_mm256_permute_ps (simde__m256 a, const int imm8) } #if defined(SIMDE_X86_AVX_NATIVE) # define simde_mm256_permute_ps(a, imm8) _mm256_permute_ps(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_permute_ps(a, imm8) HEDLEY_REINTERPRET_CAST(simde__m256, \ + __lasx_xvshuf4i_w(HEDLEY_REINTERPRET_CAST(simde__m256i, a), imm8)) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_permute_ps @@ -4543,10 +5141,14 @@ simde_mm256_permute_pd (simde__m256d a, const int imm8) r_, a_ = simde__m256d_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvshuf_d(simde_mm256_set_epi64x((imm8 >> 3) 
& 1, (imm8 >> 2) & 1, (imm8 >> 1) & 1, imm8 & 1), a_.i256, a_.i256); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)]; + } + #endif return simde__m256d_from_private(r_); } @@ -4575,6 +5177,10 @@ simde_mm_permute_ps (simde__m128 a, const int imm8) } #if defined(SIMDE_X86_AVX_NATIVE) # define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm_permute_ps(a, imm8) HEDLEY_REINTERPRET_CAST(simde__m128, __lsx_vshuf4i_w(HEDLEY_REINTERPRET_CAST(simde__m128i, a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_permute_ps(a, imm8) simde__m128_from_wasm_v128(wasm_i32x4_shuffle(simde__m128_to_wasm_v128(a), simde__m128_to_wasm_v128(a), ((imm8) & 3), (((imm8) >> 2) & 3 ), (((imm8) >> 4) & 3), (((imm8) >> 6) & 3))) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm_permute_ps @@ -4599,6 +5205,10 @@ simde_mm_permute_pd (simde__m128d a, const int imm8) } #if defined(SIMDE_X86_AVX_NATIVE) # define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm_permute_pd(a, imm8) HEDLEY_REINTERPRET_CAST(simde__m128d, __lsx_vshuf4i_d(HEDLEY_REINTERPRET_CAST(simde__m128i, a), HEDLEY_REINTERPRET_CAST(simde__m128i, a), (imm8 & 1) | (((imm8 >> 1) & 1) << 2))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_permute_pd(a, imm8) simde__m128d_from_wasm_v128(wasm_i64x2_shuffle(simde__m128d_to_wasm_v128(a), simde__m128d_to_wasm_v128(a), ((imm8) & 1), (((imm8) >> 1) & 1 ))) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm_permute_pd @@ -4616,10 +5226,20 @@ simde_mm_permutevar_ps (simde__m128 a, simde__m128i b) { a_ = simde__m128_to_private(a); simde__m128i_private b_ = simde__m128i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[b_.i32[i] & 3]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_i64 = __lsx_vshuf_w(__lsx_vand_v(b_.lsx_i64, __lsx_vreplgr2vr_w(3)), a_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_make( + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 0) & 3]), + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 1) & 3]), + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 2) & 3]), + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 3) & 3])); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = a_.f32[b_.i32[i] & 3]; + } + #endif return simde__m128_from_private(r_); #endif @@ -4640,10 +5260,18 @@ simde_mm_permutevar_pd (simde__m128d a, simde__m128i b) { a_ = simde__m128d_to_private(a); simde__m128i_private b_ = simde__m128i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_i64 = __lsx_vshuf_d(__lsx_vsrli_d(__lsx_vand_v(b_.lsx_i64, __lsx_vreplgr2vr_d(2)), 1), a_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_make( + (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 0) >> 1) & 1]), + (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 1) >> 1) & 1])); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1]; + } + #endif return simde__m128d_from_private(r_); #endif @@ -4664,10 +5292,14 
@@ simde_mm256_permutevar_ps (simde__m256 a, simde__m256i b) { a_ = simde__m256_to_private(a); simde__m256i_private b_ = simde__m256i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[(b_.i32[i] & 3) + (i & 4)]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvshuf_w(__lasx_xvand_v(b_.i256, __lasx_xvreplgr2vr_w(3)), a_.i256, a_.i256); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = a_.f32[(b_.i32[i] & 3) + (i & 4)]; + } + #endif return simde__m256_from_private(r_); #endif @@ -4688,10 +5320,14 @@ simde_mm256_permutevar_pd (simde__m256d a, simde__m256i b) { a_ = simde__m256d_to_private(a); simde__m256i_private b_ = simde__m256i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((b_.i64[i] & 2) >> 1) + (i & 2)]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvshuf_d(__lasx_xvsrli_d(__lasx_xvand_v(b_.i256, __lasx_xvreplgr2vr_d(2)), 1), a_.i256, a_.i256); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = a_.f64[((b_.i64[i] & 2) >> 1) + (i & 2)]; + } + #endif return simde__m256d_from_private(r_); #endif @@ -4772,6 +5408,8 @@ simde__m256 simde_mm256_rcp_ps (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_rcp_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfrecip_s(a); #else simde__m256_private r_, @@ -4800,6 +5438,9 @@ simde__m256 simde_mm256_rsqrt_ps (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_rsqrt_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(__loongarch_frecipe) && (HEDLEY_GCC_VERSION_CHECK(14,1,0) || SIMDE_DETECT_CLANG_VERSION_CHECK(18,0,0)) + //need to add -mfrecipe to enable __loongarch_frecipe + return __lasx_xvfrsqrte_s(a); #else simde__m256_private r_, @@ -4993,14 +5634,21 @@ simde_mm256_shuffle_ps (simde__m256 a, simde__m256 b, const int imm8) a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - r_.f32[0] = a_.m128_private[0].f32[(imm8 >> 0) & 3]; - r_.f32[1] = a_.m128_private[0].f32[(imm8 >> 2) & 3]; - r_.f32[2] = b_.m128_private[0].f32[(imm8 >> 4) & 3]; - r_.f32[3] = b_.m128_private[0].f32[(imm8 >> 6) & 3]; - r_.f32[4] = a_.m128_private[1].f32[(imm8 >> 0) & 3]; - r_.f32[5] = a_.m128_private[1].f32[(imm8 >> 2) & 3]; - r_.f32[6] = b_.m128_private[1].f32[(imm8 >> 4) & 3]; - r_.f32[7] = b_.m128_private[1].f32[(imm8 >> 6) & 3]; + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256_private m_; m_.i256 = + simde_mm256_set_epi32(((imm8 >> 6) & 3) + 4, ((imm8 >> 4) & 3) + 4, (imm8 >> 2) & 3, (imm8 >> 0) & 3, + ((imm8 >> 6) & 3) + 4, ((imm8 >> 4) & 3) + 4, (imm8 >> 2) & 3, (imm8 >> 0) & 3); + r_.i256 = __lasx_xvshuf_w(m_.i256, a_.i256, b_.i256); + #else + r_.f32[0] = a_.m128_private[0].f32[(imm8 >> 0) & 3]; + r_.f32[1] = a_.m128_private[0].f32[(imm8 >> 2) & 3]; + r_.f32[2] = b_.m128_private[0].f32[(imm8 >> 4) & 3]; + r_.f32[3] = b_.m128_private[0].f32[(imm8 >> 6) & 3]; + r_.f32[4] = a_.m128_private[1].f32[(imm8 >> 0) & 3]; + r_.f32[5] = a_.m128_private[1].f32[(imm8 >> 2) & 3]; + r_.f32[6] = b_.m128_private[1].f32[(imm8 >> 4) & 3]; + r_.f32[7] = b_.m128_private[1].f32[(imm8 >> 6) & 3]; + #endif return simde__m256_from_private(r_); } @@ -5037,10 +5685,16 @@ simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8) a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); 
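/*
 * For reference: the shuffle_pd hunk that follows replaces the scalar selection code
 * with an LASX xvshuf path and swaps the imm8 bit groups in the 128-bit-lane fallback
 * macro so that bits 0-1 drive the low lane and bits 2-3 the high lane. A minimal
 * scalar sketch of the _mm256_shuffle_pd selection pattern, matching the reference
 * loop kept in the #else branch; shuffle_pd_ref is an illustrative helper, not a
 * SIMDe function, and plain double arrays stand in for the vector types.
 */
static void shuffle_pd_ref(const double a[4], const double b[4], int imm8, double r[4]) {
  r[0] = a[( imm8       & 1)    ];  /* low lane:  bit 0 selects a[0] or a[1] */
  r[1] = b[((imm8 >> 1) & 1)    ];  /* low lane:  bit 1 selects b[0] or b[1] */
  r[2] = a[((imm8 >> 2) & 1) | 2];  /* high lane: bit 2 selects a[2] or a[3] */
  r[3] = b[((imm8 >> 3) & 1) | 2];  /* high lane: bit 3 selects b[2] or b[3] */
}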
- r_.f64[0] = a_.f64[((imm8 ) & 1) ]; - r_.f64[1] = b_.f64[((imm8 >> 1) & 1) ]; - r_.f64[2] = a_.f64[((imm8 >> 2) & 1) | 2]; - r_.f64[3] = b_.f64[((imm8 >> 3) & 1) | 2]; + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + simde__m256d_private m_; m_.i256 = + simde_mm256_set_epi64x(((imm8 >> 3) & 1) | 2, ((imm8 >> 2) & 1), ((imm8 >> 1) & 1) | 2, (imm8 >> 0) & 1); + r_.i256 = __lasx_xvshuf_w(m_.i256, a_.i256, b_.i256); + #else + r_.f64[0] = a_.f64[((imm8 ) & 1) ]; + r_.f64[1] = b_.f64[((imm8 >> 1) & 1) ]; + r_.f64[2] = a_.f64[((imm8 >> 2) & 1) | 2]; + r_.f64[3] = b_.f64[((imm8 >> 3) & 1) | 2]; + #endif return simde__m256d_from_private(r_); } @@ -5049,8 +5703,8 @@ simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm256_shuffle_pd(a, b, imm8) \ simde_mm256_set_m128d( \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 0) & 3), \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 2) & 3)) + simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 2) & 3), \ + simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 0) & 3)) #elif defined(SIMDE_SHUFFLE_VECTOR_) #define simde_mm256_shuffle_pd(a, b, imm8) \ SIMDE_SHUFFLE_VECTOR_(64, 32, a, b, \ @@ -5069,6 +5723,8 @@ simde__m256 simde_mm256_sqrt_ps (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_sqrt_ps(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsqrt_s(a); #else simde__m256_private r_, @@ -5099,6 +5755,8 @@ simde__m256d simde_mm256_sqrt_pd (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_sqrt_pd(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsqrt_d(a); #else simde__m256d_private r_, @@ -5129,6 +5787,8 @@ void simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_ps(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); #endif @@ -5143,6 +5803,8 @@ void simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_pd(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); #endif @@ -5157,6 +5819,8 @@ void simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_si256(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); #endif @@ -5171,6 +5835,8 @@ void simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_ps(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -5185,6 +5851,13 @@ void simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_pd(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + simde__m256d_private a_ = simde__m256d_to_private(a); + for (size_t i = 0 ; i < (sizeof(a_.m128d) 
/ sizeof(a_.m128d[0])) ; i++) { + simde_mm_storeu_pd(mem_addr + 2*i, a_.m128d[i]); + } #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -5199,6 +5872,8 @@ void simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -5258,6 +5933,10 @@ void simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_ps(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); #endif @@ -5272,6 +5951,10 @@ void simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_pd(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); #endif @@ -5286,8 +5969,12 @@ void simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_si256(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvst(a, mem_addr, 0); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); + simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -5300,6 +5987,8 @@ simde__m256 simde_mm256_sub_ps (simde__m256 a, simde__m256 b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_sub_ps(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsub_s(a, b); #else simde__m256_private r_, @@ -5345,6 +6034,8 @@ simde__m256d simde_mm256_sub_pd (simde__m256d a, simde__m256d b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_sub_pd(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsub_d(a, b); #else simde__m256d_private r_, @@ -5467,7 +6158,9 @@ simde_mm256_xor_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvxor_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_xor_ps(a_.m128[0], b_.m128[0]); r_.m128[1] = simde_mm_xor_ps(a_.m128[1], b_.m128[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -5498,7 +6191,9 @@ simde_mm256_xor_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvxor_v(a_.i256, b_.i256); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128d[0] = simde_mm_xor_pd(a_.m128d[0], b_.m128d[0]); r_.m128d[1] = simde_mm_xor_pd(a_.m128d[1], b_.m128d[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ 
-5535,6 +6230,8 @@ simde__m256 simde_x_mm256_negate_ps(simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return simde_mm256_xor_ps(a,_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return simde_mm256_xor_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0))); #else simde__m256_private r_, @@ -5558,6 +6255,8 @@ simde__m256d simde_x_mm256_negate_pd(simde__m256d a) { #if defined(SIMDE_X86_AVX2_NATIVE) return simde_mm256_xor_pd(a, _mm256_set1_pd(SIMDE_FLOAT64_C(-0.0))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return simde_mm256_xor_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.0))); #else simde__m256d_private r_, @@ -5587,7 +6286,9 @@ simde_mm256_unpackhi_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvilvh_w(b_.i256, a_.i256); + #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 2, 10, 3, 11, 6, 14, 7, 15); #else r_.f32[0] = a_.f32[2]; @@ -5619,7 +6320,9 @@ simde_mm256_unpackhi_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvilvh_d(b_.i256, a_.i256); + #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7); #else r_.f64[0] = a_.f64[1]; @@ -5647,7 +6350,9 @@ simde_mm256_unpacklo_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvilvl_w(b_.i256, a_.i256); + #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 8, 1, 9, 4, 12, 5, 13); #else r_.f32[0] = a_.f32[0]; @@ -5679,7 +6384,9 @@ simde_mm256_unpacklo_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvilvl_d(b_.i256, a_.i256); + #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6); #else r_.f64[0] = a_.f64[0]; @@ -5701,6 +6408,8 @@ simde__m256 simde_mm256_zextps128_ps256 (simde__m128 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_insertf128_ps(_mm256_setzero_ps(), a, 0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return simde_mm256_insertf128_ps(simde_mm256_setzero_ps(), a, 0); #else simde__m256_private r_; @@ -5720,6 +6429,8 @@ simde__m256d simde_mm256_zextpd128_pd256 (simde__m128d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_insertf128_pd(_mm256_setzero_pd(), a, 0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return simde_mm256_insertf128_pd(simde_mm256_setzero_pd(), a, 0); #else simde__m256d_private r_; @@ -5739,6 +6450,8 @@ simde__m256i simde_mm256_zextsi128_si256 (simde__m128i a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_insertf128_si256(_mm256_setzero_si256(), a, 0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return simde_mm256_insertf128_si256(simde_mm256_setzero_si256(), a, 0); #else simde__m256i_private r_; @@ -5768,6 +6481,10 @@ simde_mm_testc_ps (simde__m128 a, simde__m128 b) { m = wasm_v128_and(m, simde_mm_movehl_ps(m, m)); m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(m, 0); + #elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); + a_.lsx_i64 = __lsx_vmskltz_w(a_.lsx_i64); + return __lsx_vpickve2gr_w(a_.lsx_i64, 0) ? 0 : 1; #else uint_fast32_t r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) @@ -5797,6 +6514,10 @@ simde_mm_testc_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 63); return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); + a_.lsx_i64 = __lsx_vmskltz_d(a_.lsx_i64); + return __lsx_vpickve2gr_w(a_.lsx_i64, 0) ? 0 : 1; #else uint_fast64_t r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) @@ -5824,10 +6545,16 @@ simde_mm256_testc_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= ~a_.u32[i] & b_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvandn_v(a_.i256, b_.i256); + a_.i256 = __lasx_xvmskltz_w(a_.i256); + return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1; + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= ~a_.u32[i] & b_.u32[i]; + } + #endif return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); #endif @@ -5848,10 +6575,16 @@ simde_mm256_testc_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= ~a_.u64[i] & b_.u64[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvandn_v(a_.i256, b_.i256); + a_.i256 = __lasx_xvmskltz_d(a_.i256); + return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1; + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= ~a_.u64[i] & b_.u64[i]; + } + #endif return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); #endif @@ -5872,10 +6605,16 @@ simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= ~a_.i32f[i] & b_.i32f[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvandn_v(a_.i256, b_.i256); + a_.i256 = __lasx_xvmsknz_b(a_.i256); + return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1; + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { + r |= ~a_.i32f[i] & b_.i32f[i]; + } + #endif return HEDLEY_STATIC_CAST(int, !r); #endif @@ -5900,6 +6639,10 @@ simde_mm_testz_ps (simde__m128 a, simde__m128 b) { m = wasm_v128_and(m, simde_mm_movehl_ps(m, m)); m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(m, 0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); + a_.lsx_i64 = __lsx_vmskltz_w(a_.lsx_i64); + return __lsx_vpickve2gr_w(a_.lsx_i64, 0) ? 
0 : 1; #else uint_fast32_t r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) @@ -5929,6 +6672,10 @@ simde_mm_testz_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 63); return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1)); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); + a_.lsx_i64 = __lsx_vmskltz_d(a_.lsx_i64); + return __lsx_vpickve2gr_w(a_.lsx_i64, 0) ? 0 : 1; #else uint_fast64_t r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) @@ -5956,10 +6703,16 @@ simde_mm256_testz_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= a_.u32[i] & b_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvand_v(a_.i256, b_.i256); + a_.i256 = __lasx_xvmskltz_w(a_.i256); + return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1; + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= a_.u32[i] & b_.u32[i]; + } + #endif return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); #endif @@ -5980,10 +6733,16 @@ simde_mm256_testz_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= a_.u64[i] & b_.u64[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvand_v(a_.i256, b_.i256); + a_.i256 = __lasx_xvmskltz_d(a_.i256); + return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1; + #else + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= a_.u64[i] & b_.u64[i]; + } + #endif return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); #endif @@ -6004,7 +6763,11 @@ simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvand_v(a_.i256, b_.i256); + a_.i256 = __lasx_xvmsknz_b(a_.i256); + return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 
0 : 1; + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]); #else SIMDE_VECTORIZE_REDUCTION(|:r) @@ -6041,6 +6804,11 @@ simde_mm_testnzc_ps (simde__m128 a, simde__m128 b) { m = wasm_v128_or(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); m2 = wasm_v128_or(m2, simde_mm_shuffle_epi32(m2, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(m, 0) & wasm_i32x4_extract_lane(m2, 0); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m128i m = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); + __m128i n = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); + m = __lsx_vmskltz_w(m); n = __lsx_vmskltz_w(n); + return (__lsx_vpickve2gr_w(m, 0) != 0) && (__lsx_vpickve2gr_w(n, 0) != 0); #else uint32_t rz = 0, rc = 0; for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { @@ -6072,7 +6840,12 @@ simde_mm_testnzc_pd (simde__m128d a, simde__m128d b) { v128_t m = wasm_u64x2_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 63); v128_t m2 = wasm_u64x2_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 63); return HEDLEY_STATIC_CAST(int, (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) - & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1))); + & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m128i m = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); + __m128i n = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); + m = __lsx_vmskltz_d(m); n = __lsx_vmskltz_d(n); + return (__lsx_vpickve2gr_w(m, 0) != 0) && (__lsx_vpickve2gr_w(n, 0) != 0); #else uint64_t rc = 0, rz = 0; for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { @@ -6102,14 +6875,23 @@ simde_mm256_testnzc_ps (simde__m256 a, simde__m256 b) { a_ = simde__m256_to_private(a), b_ = simde__m256_to_private(b); - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - rc |= ~a_.u32[i] & b_.u32[i]; - rz |= a_.u32[i] & b_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i m = __lasx_xvandn_v(a_.i256, b_.i256); + __m256i n = __lasx_xvand_v(a_.i256, b_.i256); + m = __lasx_xvmskltz_w(m); n = __lasx_xvmskltz_w(n); + rc = __lasx_xvpickve2gr_w(m, 0) + __lasx_xvpickve2gr_w(m, 4); + rz = __lasx_xvpickve2gr_w(n, 0) + __lasx_xvpickve2gr_w(n, 4); + return (rc != 0) && (rz != 0); + #else + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + rc |= ~a_.u32[i] & b_.u32[i]; + rz |= a_.u32[i] & b_.u32[i]; + } - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); + return + (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & + (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -6128,14 +6910,23 @@ simde_mm256_testnzc_pd (simde__m256d a, simde__m256d b) { a_ = simde__m256d_to_private(a), b_ = simde__m256d_to_private(b); - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - rc |= ~a_.u64[i] & b_.u64[i]; - rz |= a_.u64[i] & b_.u64[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i m = __lasx_xvandn_v(a_.i256, b_.i256); + __m256i n = __lasx_xvand_v(a_.i256, b_.i256); + m = __lasx_xvmskltz_d(m); n = __lasx_xvmskltz_d(n); + rc = __lasx_xvpickve2gr_w(m, 0) + __lasx_xvpickve2gr_w(m, 4); + rz = __lasx_xvpickve2gr_w(n, 0) + __lasx_xvpickve2gr_w(n, 4); + return (rc != 0) && (rz != 0); + #else + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + rc |= ~a_.u64[i] & b_.u64[i]; + rz |= a_.u64[i] & b_.u64[i]; + } - return - 
(rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); + return + (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & + (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -6154,12 +6945,21 @@ simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - rc |= ~a_.i32f[i] & b_.i32f[i]; - rz |= a_.i32f[i] & b_.i32f[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i m = __lasx_xvandn_v(a_.i256, b_.i256); + __m256i n = __lasx_xvand_v(a_.i256, b_.i256); + m = __lasx_xvmsknz_b(m); n = __lasx_xvmsknz_b(n); + rc = __lasx_xvpickve2gr_w(m, 0) + __lasx_xvpickve2gr_w(m, 4); + rz = __lasx_xvpickve2gr_w(n, 0) + __lasx_xvpickve2gr_w(n, 4); + return (rc != 0) && (rz != 0); + #else + for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { + rc |= ~a_.i32f[i] & b_.i32f[i]; + rz |= a_.i32f[i] & b_.i32f[i]; + } - return !!(rc & rz); + return !!(rc & rz); + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) diff --git a/x86/avx2.h b/x86/avx2.h index 1247b5193..6085e5860 100644 --- a/x86/avx2.h +++ b/x86/avx2.h @@ -41,6 +41,8 @@ simde__m256i simde_mm256_abs_epi8 (simde__m256i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_abs_epi8(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvabsd_b(a, __lasx_xvreplgr2vr_b(0)); #else simde__m256i_private r_, @@ -69,6 +71,8 @@ simde__m256i simde_mm256_abs_epi16 (simde__m256i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_abs_epi16(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvabsd_h(a, __lasx_xvreplgr2vr_h(0)); #else simde__m256i_private r_, @@ -97,6 +101,8 @@ simde__m256i simde_mm256_abs_epi32(simde__m256i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_abs_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvabsd_w(a, __lasx_xvreplgr2vr_w(0)); #else simde__m256i_private r_, @@ -125,6 +131,8 @@ simde__m256i simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_add_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvadd_b(a, b); #else simde__m256i_private r_, @@ -156,6 +164,8 @@ simde__m256i simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_add_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvadd_h(a, b); #else simde__m256i_private r_, @@ -201,6 +211,8 @@ simde__m256i simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_add_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvadd_w(a, b); #else simde__m256i_private r_, @@ -246,6 +258,8 @@ simde__m256i simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_add_epi64(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvadd_d(a, b); #else simde__m256i_private r_, @@ -318,6 +332,8 @@ simde__m256i simde_mm256_and_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_and_si256(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvand_v(a, b); #else simde__m256i_private r_, @@ -349,6 +365,8 @@ simde__m256i simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_andnot_si256(a, b); + #elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvandn_v(a, b); #else simde__m256i_private r_, @@ -378,6 +396,8 @@ simde__m256i simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_adds_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsadd_b(a, b); #else simde__m256i_private r_, @@ -407,6 +427,8 @@ simde__m256i simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_adds_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsadd_h(a, b); #else simde__m256i_private r_, @@ -450,6 +472,8 @@ simde__m256i simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_adds_epu8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsadd_bu(a, b); #else simde__m256i_private r_, @@ -479,6 +503,8 @@ simde__m256i simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_adds_epu16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsadd_hu(a, b); #else simde__m256i_private r_, @@ -508,6 +534,8 @@ simde__m256i simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_avg_epu8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvavgr_bu(a, b); #else simde__m256i_private r_, @@ -532,6 +560,8 @@ simde__m256i simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_avg_epu16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvavgr_hu(a, b); #else simde__m256i_private r_, @@ -569,6 +599,9 @@ simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm_blend_epi32(a, b, imm8) __lsx_vbitsel_v(a, b, \ + simde_mm_set_epi32(-((imm8 >> 3) & 1), -((imm8 >> 2) & 1), -((imm8 >> 1)& 1), -(imm8 & 1))) #elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) # define simde_mm_blend_epi32(a, b, imm8) \ simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8))) @@ -596,8 +629,12 @@ simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560) # define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8)) -#elif defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# undef sr +# define sr(i, j) -((i >> j) & 1) +# define simde_mm256_blend_epi16(a, b, imm8) __lasx_xvbitsel_v(a, b, \ + simde_mm256_set_epi16(sr(imm8, 7), sr(imm8, 6), sr(imm8, 5), sr(imm8, 4), sr(imm8, 3), sr(imm8, 2), sr(imm8, 1), sr(imm8, 0), \ + sr(imm8, 7), sr(imm8, 6), sr(imm8, 5), sr(imm8, 4), sr(imm8, 3), sr(imm8, 2), sr(imm8, 1), sr(imm8, 0))) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_blend_epi16(a, b, imm8) \ simde_mm256_set_m128i( \ @@ -628,6 +665,11 @@ simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# undef sr +# define sr(i, j) -((i >> j) & 1) +# define simde_mm256_blend_epi32(a, b, imm8) __lasx_xvbitsel_v(a, b, \ + 
simde_mm256_set_epi32(sr(imm8, 7), sr(imm8, 6), sr(imm8, 5), sr(imm8, 4), sr(imm8, 3), sr(imm8, 2), sr(imm8, 1), sr(imm8, 0))) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_blend_epi32(a, b, imm8) \ simde_mm256_set_m128i( \ @@ -645,6 +687,8 @@ simde__m256i simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_blendv_epi8(a, b, mask); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvbitsel_v(a, b, __lasx_xvslti_b(mask, 0)); #else simde__m256i_private r_, @@ -682,6 +726,8 @@ simde__m128i simde_mm_broadcastb_epi8 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_broadcastb_epi8(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lsx_vreplvei_b(a, 0); #else simde__m128i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -704,6 +750,8 @@ simde__m256i simde_mm256_broadcastb_epi8 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_broadcastb_epi8(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_b(__lsx_vpickve2gr_b(a, 0)); #else simde__m256i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -726,6 +774,8 @@ simde__m128i simde_mm_broadcastw_epi16 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_broadcastw_epi16(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lsx_vreplvei_h(a, 0); #else simde__m128i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -748,6 +798,8 @@ simde__m256i simde_mm256_broadcastw_epi16 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_broadcastw_epi16(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_h(__lsx_vpickve2gr_h(a, 0)); #else simde__m256i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -770,6 +822,8 @@ simde__m128i simde_mm_broadcastd_epi32 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_broadcastd_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lsx_vreplvei_w(a, 0); #else simde__m128i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -792,6 +846,8 @@ simde__m256i simde_mm256_broadcastd_epi32 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_broadcastd_epi32(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_w(__lsx_vpickve2gr_w(a, 0)); #else simde__m256i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -814,6 +870,8 @@ simde__m128i simde_mm_broadcastq_epi64 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_broadcastq_epi64(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lsx_vreplvei_d(a, 0); #else simde__m128i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -836,6 +894,8 @@ simde__m256i simde_mm256_broadcastq_epi64 (simde__m128i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_broadcastq_epi64(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvreplgr2vr_d(__lsx_vpickve2gr_d(a, 0)); #else simde__m256i_private r_; simde__m128i_private a_= simde__m128i_to_private(a); @@ -864,7 +924,9 @@ simde_mm_broadcastss_ps (simde__m128 a) { simde__m128_private r_; simde__m128_private a_= simde__m128_to_private(a); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_i64 = __lsx_vreplvei_w(a_.lsx_i64, 0); + #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); #else SIMDE_VECTORIZE @@ -893,6 +955,8 @@ 
simde_mm256_broadcastss_ps (simde__m128 a) { #if defined(SIMDE_X86_AVX_NATIVE) __m128 tmp = _mm_permute_ps(a_.n, 0); r_.n = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 1); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvreplgr2vr_w(__lsx_vpickve2gr_w(a_.lsx_i64, 0)); #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 0, 0, 0, 0, 0, 0, 0); #elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) @@ -931,10 +995,14 @@ simde_mm256_broadcastsd_pd (simde__m128d a) { simde__m256d_private r_; simde__m128d_private a_= simde__m128d_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[0]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvreplgr2vr_d(__lsx_vpickve2gr_d(a_.lsx_i64, 0)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = a_.f64[0]; + } + #endif return simde__m256d_from_private(r_); #endif @@ -954,7 +1022,10 @@ simde_mm256_broadcastsi128_si256 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_q(r_.i256, r_.i256, 0x00); + #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i_private[0] = a_; r_.m128i_private[1] = a_; #else @@ -1007,6 +1078,8 @@ simde_mm256_bslli_epi128 (simde__m256i a, const int imm8) (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \ SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) #define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_bslli_epi128(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvbsll_v(a, imm8)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_bslli_epi128 @@ -1045,6 +1118,8 @@ simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8) (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \ SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) #define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_bsrli_epi128(a, imm8) (imm8 > 15 ? 
__lasx_xvreplgr2vr_d(0) : __lasx_xvbsrl_v(a, imm8)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_bsrli_epi128 @@ -1056,6 +1131,8 @@ simde__m256i simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpeq_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvseq_b(a, b); #else simde__m256i_private r_, @@ -1085,6 +1162,8 @@ simde__m256i simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpeq_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvseq_h(a, b); #else simde__m256i_private r_, @@ -1114,6 +1193,8 @@ simde__m256i simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpeq_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvseq_w(a, b); #else simde__m256i_private r_, @@ -1143,6 +1224,8 @@ simde__m256i simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpeq_epi64(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvseq_d(a, b); #else simde__m256i_private r_, @@ -1172,6 +1255,8 @@ simde__m256i simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpgt_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvslt_b(b, a); #else simde__m256i_private r_, @@ -1203,6 +1288,8 @@ simde__m256i simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpgt_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvslt_h(b, a); #else simde__m256i_private r_, @@ -1234,6 +1321,8 @@ simde__m256i simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpgt_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvslt_w(b, a); #else simde__m256i_private r_, @@ -1265,6 +1354,8 @@ simde__m256i simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_cmpgt_epi64(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvslt_d(b, a); #else simde__m256i_private r_, @@ -1300,7 +1391,11 @@ simde_mm256_cvtepi8_epi16 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_h_b(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8); #else SIMDE_VECTORIZE @@ -1326,7 +1421,12 @@ simde_mm256_cvtepi8_epi32 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvsllwil_h_b(r_.i256, 0); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_w_h(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8); #else SIMDE_VECTORIZE @@ -1352,10 +1452,18 @@ simde_mm256_cvtepi8_epi64 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i8[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + 
r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvsllwil_h_b(r_.i256, 0); + r_.i256 = __lasx_xvsllwil_w_h(r_.i256, 0); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_d_w(r_.i256, 0); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = a_.i8[i]; + } + #endif return simde__m256i_from_private(r_); #endif @@ -1374,7 +1482,11 @@ simde_mm256_cvtepi16_epi32 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_w_h(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16); #else SIMDE_VECTORIZE @@ -1400,7 +1512,12 @@ simde_mm256_cvtepi16_epi64 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvsllwil_w_h(r_.i256, 0); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_d_w(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16); #else SIMDE_VECTORIZE @@ -1426,7 +1543,11 @@ simde_mm256_cvtepi32_epi64 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_d_w(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32); #else SIMDE_VECTORIZE @@ -1452,7 +1573,11 @@ simde_mm256_cvtepu8_epi16 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_hu_bu(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8); #else SIMDE_VECTORIZE @@ -1478,7 +1603,12 @@ simde_mm256_cvtepu8_epi32 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvsllwil_hu_bu(r_.i256, 0); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_wu_hu(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8); #else SIMDE_VECTORIZE @@ -1504,10 +1634,18 @@ simde_mm256_cvtepu8_epi64 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u8[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvsllwil_hu_bu(r_.i256, 0); + r_.i256 = __lasx_xvsllwil_wu_hu(r_.i256, 0); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_du_wu(r_.i256, 0); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = a_.u8[i]; + } + #endif return simde__m256i_from_private(r_); #endif @@ -1526,7 +1664,11 @@ simde_mm256_cvtepu16_epi32 (simde__m128i a) { simde__m256i_private r_; 
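For context on the widening conversions above: the new LASX paths combine __lasx_xvsllwil_* widening steps with an __lasx_xvpermi_d(r, 0xd8) lane fix-up so that they produce the same result as the portable element-by-element loops. A minimal scalar sketch of the zero-extending case, assuming nothing beyond the loop already shown in the patch (names here are illustrative only):

#include <stdint.h>
#include <stddef.h>

/* Reference model, not part of the patch: what an _mm256_cvtepu8_epi64-style
 * conversion must produce -- zero-extend the low four bytes of the 128-bit
 * source into the four 64-bit lanes of the 256-bit result. */
static void cvtepu8_epi64_ref(const uint8_t src[16], int64_t dst[4]) {
  for (size_t i = 0; i < 4; i++) {
    dst[i] = (int64_t)src[i];  /* unsigned 8-bit source, so this zero-extends */
  }
}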
simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_wu_hu(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16); #else SIMDE_VECTORIZE @@ -1552,7 +1694,12 @@ simde_mm256_cvtepu16_epi64 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvsllwil_wu_hu(r_.i256, 0); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_du_wu(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16); #else SIMDE_VECTORIZE @@ -1578,7 +1725,11 @@ simde_mm256_cvtepu32_epi64 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.m128i_private[0] = a_; + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.i256 = __lasx_xvsllwil_du_wu(r_.i256, 0); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32); #else SIMDE_VECTORIZE @@ -1602,7 +1753,8 @@ simde_mm256_extract_epi8 (simde__m256i a, const int index) simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i8[index]; } -#if defined(SIMDE_X86_AVX2_NATIVE) +#if defined(SIMDE_X86_AVX2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -1617,7 +1769,8 @@ simde_mm256_extract_epi16 (simde__m256i a, const int index) simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i16[index]; } -#if defined(SIMDE_X86_AVX2_NATIVE) +#if defined(SIMDE_X86_AVX2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -2267,11 +2420,11 @@ simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, co return simde__m256_from_private(r_); } #if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) + #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, (base_addr)), (vindex), (scale)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_i32gather_ps - #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) + #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, (base_addr)), (vindex), (scale)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -2724,6 +2877,8 @@ simde__m256i simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_madd_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmaddwod_w_h(__lasx_xvmulwev_w_h(a, b), a, b); #else simde__m256i_private r_, @@ -2768,6 +2923,8 @@ simde__m256i 
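The _mm256_madd_epi16 mapping above multiplies the even and the odd 16-bit lanes separately (__lasx_xvmulwev_w_h plus __lasx_xvmaddwod_w_h) and accumulates the widened products; the maddubs variant that follows uses the unsigned-by-signed forms with a saturating add. A scalar sketch of the madd semantics both must reproduce (illustrative names, not from the patch):

#include <stdint.h>
#include <stddef.h>

/* Reference model, not part of the patch: each 32-bit result lane is the sum
 * of the two adjacent signed 16-bit products. */
static void madd_epi16_ref(const int16_t a[16], const int16_t b[16], int32_t r[8]) {
  for (size_t i = 0; i < 8; i++) {
    int64_t sum = (int64_t)a[2 * i]     * b[2 * i]
                + (int64_t)a[2 * i + 1] * b[2 * i + 1];
    r[i] = (int32_t)sum;  /* wraps only in the all -32768 corner case, matching the instruction */
  }
}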
simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_maddubs_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsadd_h(__lasx_xvmulwod_h_bu_b(a, b), __lasx_xvmulwev_h_bu_b(a, b)); #else simde__m256i_private r_, @@ -2803,19 +2960,26 @@ simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m return _mm_maskload_epi32(mem_addr, mask); #else simde__m128i_private - mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi32(mem_addr)), r_, - mask_ = simde__m128i_to_private(mask); + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31)); + mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_shr_.lsx_i64 = __lsx_vsrli_w(mask_.lsx_i64, 31); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31); + mask_shr_.i32[i] = mask_.i32[i] >> 31; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = mask_shr_.i32[i] ? mem_addr[i] : INT32_C(0); + } + return simde__m128i_from_private(r_); #endif } @@ -2826,17 +2990,27 @@ simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m SIMDE_FUNCTION_ATTRIBUTES simde__m256i -simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { +simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_maskload_epi32(mem_addr, mask); #else simde__m256i_private mask_ = simde__m256i_to_private(mask), - r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi32(mem_addr)); + r_, + mask_shr_; + + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_shr_.i256 = __lasx_xvsrli_w(mask_.i256, 31); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + mask_shr_.i32[i] = mask_.i32[i] >> 31; + } + #endif SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] &= mask_.i32[i] >> 31; + r_.i32[i] = mask_shr_.i32[i] ? 
mem_addr[i] : INT32_C(0); } return simde__m256i_from_private(r_); @@ -2849,24 +3023,31 @@ simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde SIMDE_FUNCTION_ATTRIBUTES simde__m128i -simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { +simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask); #else simde__m128i_private - mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi64((mem_addr))), r_, - mask_ = simde__m128i_to_private(mask); + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63)); + mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_shr_.lsx_i64 = __lsx_vsrli_d(mask_.lsx_i64, 63); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63); + for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { + mask_shr_.i64[i] = mask_.i64[i] >> 63; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = mask_shr_.i64[i] ? mem_addr[i] : INT64_C(0); + } + return simde__m128i_from_private(r_); #endif } @@ -2883,11 +3064,21 @@ simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde #else simde__m256i_private mask_ = simde__m256i_to_private(mask), - r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi64((mem_addr))); + r_, + mask_shr_; + + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_shr_.i256 = __lasx_xvsrli_d(mask_.i256, 63); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + mask_shr_.i64[i] = mask_.i64[i] >> 63; + } + #endif SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] &= mask_.i64[i] >> 63; + r_.i64[i] = mask_shr_.i64[i] ? 
mem_addr[i] : INT64_C(0); } return simde__m256i_from_private(r_); @@ -2907,11 +3098,17 @@ simde_mm_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i simde__m128i_private mask_ = simde__m128i_to_private(mask); simde__m128i_private a_ = simde__m128i_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.i32[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_.lsx_i64 = __lsx_vslti_w(mask_.lsx_i64, 0); + a_.lsx_i64 = __lsx_vbitsel_v(__lsx_vld(mem_addr, 0), a_.lsx_i64, mask_.lsx_i64); + __lsx_vst(a_.lsx_i64, mem_addr, 0); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + if (mask_.u32[i] & (UINT32_C(1) << 31)) + mem_addr[i] = a_.i32[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -2928,11 +3125,17 @@ simde_mm256_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m25 simde__m256i_private mask_ = simde__m256i_to_private(mask); simde__m256i_private a_ = simde__m256i_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.i32[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_.i256 = __lasx_xvslti_w(mask_.i256, 0); + a_.i256 = __lasx_xvbitsel_v(__lasx_xvld(mem_addr, 0), a_.i256, mask_.i256); + __lasx_xvst(a_.i256, mem_addr, 0); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + if (mask_.u32[i] & (UINT32_C(1) << 31)) + mem_addr[i] = a_.i32[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -2949,11 +3152,16 @@ simde_mm_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i simde__m128i_private mask_ = simde__m128i_to_private(mask); simde__m128i_private a_ = simde__m128i_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.i64[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + if (__lsx_vpickve2gr_d(mask_.lsx_i64, 0) < 0) __lsx_vstelm_d(a_.lsx_i64, mem_addr, 0, 0); + if (__lsx_vpickve2gr_d(mask_.lsx_i64, 1) < 0) __lsx_vstelm_d(a_.lsx_i64, mem_addr, 8, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + if (mask_.u64[i] >> 63) + mem_addr[i] = a_.i64[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -2970,11 +3178,17 @@ simde_mm256_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m25 simde__m256i_private mask_ = simde__m256i_to_private(mask); simde__m256i_private a_ = simde__m256i_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if (mask_.u64[i] & (UINT64_C(1) << 63)) - mem_addr[i] = a_.i64[i]; - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + mask_.i256 = __lasx_xvslti_d(mask_.i256, 0); + a_.i256 = __lasx_xvbitsel_v(__lasx_xvld(mem_addr, 0), a_.i256, mask_.i256); + __lasx_xvst(a_.i256, mem_addr, 0); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + if (mask_.u64[i] & (UINT64_C(1) << 63)) + mem_addr[i] = a_.i64[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -2987,6 +3201,8 @@ simde__m256i simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI) return _mm256_max_epi8(a, b); + #elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmax_b(a, b); #else simde__m256i_private r_, @@ -3016,6 +3232,8 @@ simde__m256i simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_max_epu8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmax_bu(a, b); #else simde__m256i_private r_, @@ -3045,6 +3263,8 @@ simde__m256i simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_max_epu16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmax_hu(a, b); #else simde__m256i_private r_, @@ -3074,6 +3294,8 @@ simde__m256i simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_max_epu32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmax_wu(a, b); #else simde__m256i_private r_, @@ -3103,6 +3325,8 @@ simde__m256i simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_max_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmax_h(a, b); #else simde__m256i_private r_, @@ -3132,6 +3356,8 @@ simde__m256i simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_max_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmax_w(a, b); #else simde__m256i_private r_, @@ -3161,6 +3387,8 @@ simde__m256i simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI) return _mm256_min_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmin_b(a, b); #else simde__m256i_private r_, @@ -3190,6 +3418,8 @@ simde__m256i simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_min_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmin_h(a, b); #else simde__m256i_private r_, @@ -3219,6 +3449,8 @@ simde__m256i simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_min_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmin_w(a, b); #else simde__m256i_private r_, @@ -3248,6 +3480,8 @@ simde__m256i simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_min_epu8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmin_bu(a, b); #else simde__m256i_private r_, @@ -3277,6 +3511,8 @@ simde__m256i simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_min_epu16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmin_hu(a, b); #else simde__m256i_private r_, @@ -3306,6 +3542,8 @@ simde__m256i simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_min_epu32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmin_wu(a, b); #else simde__m256i_private r_, @@ -3335,6 +3573,9 @@ int32_t simde_mm256_movemask_epi8 (simde__m256i a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_movemask_epi8(a); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + a = __lasx_xvmskltz_b(a); + return (__lasx_xvpickve2gr_w(a, 0) | (__lasx_xvpickve2gr_w(a, 4) << 16)); #else simde__m256i_private a_ = simde__m256i_to_private(a); uint32_t r = 0; @@ -3411,6 +3652,8 @@ simde__m256i simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mul_epi32(a, 
b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmulwev_d_w(a, b); #else simde__m256i_private r_, @@ -3441,6 +3684,8 @@ simde__m256i simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mul_epu32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmulwev_d_wu(a, b); #else simde__m256i_private r_, @@ -3469,6 +3714,8 @@ simde__m256i simde_mm256_mulhi_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mulhi_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmuh_h(a, b); #else simde__m256i_private r_, @@ -3492,6 +3739,8 @@ simde__m256i simde_mm256_mulhi_epu16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mulhi_epu16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmuh_hu(a, b); #else simde__m256i_private r_, @@ -3515,6 +3764,12 @@ simde__m256i simde_mm256_mulhrs_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mulhrs_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + __m256i t1_ = __lasx_xvmulwev_w_h(a, b), + t2_ = __lasx_xvmulwod_w_h(a, b); + t1_ = __lasx_xvssrarni_h_w(t1_, t1_, 15); + t2_ = __lasx_xvssrarni_h_w(t2_, t2_, 15); + return __lasx_xvilvl_h(t2_, t1_); #else simde__m256i_private r_, @@ -3538,6 +3793,8 @@ simde__m256i simde_mm256_mullo_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mullo_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmul_h(a, b); #else simde__m256i_private a_ = simde__m256i_to_private(a), @@ -3562,6 +3819,8 @@ simde__m256i simde_mm256_mullo_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_mullo_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvmul_w(a, b); #else simde__m256i_private a_ = simde__m256i_to_private(a), @@ -3606,6 +3865,8 @@ simde__m256i simde_mm256_or_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_or_si256(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvor_v(a, b); #else simde__m256i_private r_, @@ -3637,6 +3898,8 @@ simde__m256i simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_packs_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssrarni_b_h(b, a, 0); #else simde__m256i_private r_, @@ -3671,6 +3934,8 @@ simde__m256i simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_packs_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssrarni_h_w(b, a, 0); #else simde__m256i_private r_, @@ -3703,6 +3968,8 @@ simde__m256i simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_packus_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssrarni_bu_h(b, a, 0); #else simde__m256i_private r_, @@ -3737,6 +4004,8 @@ simde__m256i simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_packus_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssrarni_hu_w(b, a, 0); #else simde__m256i_private r_, @@ -3805,6 +4074,8 @@ simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8) 
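Both _mm256_permute4x64_epi64 and __lasx_xvpermi_d select each 64-bit destination lane with a 2-bit field of imm8, which is why the LASX macro below can forward imm8 unchanged. A scalar sketch of that selection (illustrative names, not part of the patch):

#include <stdint.h>
#include <stddef.h>

/* Reference model: destination lane i takes source lane (imm8 >> 2*i) & 3. */
static void permute4x64_ref(const int64_t a[4], int imm8, int64_t r[4]) {
  for (size_t i = 0; i < 4; i++) {
    r[i] = a[(imm8 >> (2 * i)) & 3];
  }
}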
+#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_permute4x64_epi64(a, imm8) __lasx_xvpermi_d(a, imm8) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_permute4x64_epi64 @@ -3828,6 +4099,8 @@ simde_mm256_permute4x64_pd (simde__m256d a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_permute4x64_pd(a, imm8) _mm256_permute4x64_pd(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_permute4x64_pd(a, imm8) HEDLEY_REINTERPRET_CAST(simde__m256d, __lasx_xvpermi_d(a, imm8)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_permute4x64_pd @@ -3839,6 +4112,8 @@ simde__m256i simde_mm256_permutevar8x32_epi32 (simde__m256i a, simde__m256i idx) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_permutevar8x32_epi32(a, idx); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvperm_w(a, idx); #else simde__m256i_private r_, @@ -3867,6 +4142,8 @@ simde_mm256_permutevar8x32_ps (simde__m256 a, simde__m256i idx) { #else return _mm256_permutevar8x32_ps(a, idx); #endif + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return HEDLEY_REINTERPRET_CAST(simde__m256, __lasx_xvperm_w(HEDLEY_REINTERPRET_CAST(simde__m256i, a), idx)); #else simde__m256_private r_, @@ -3932,7 +4209,12 @@ simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvslti_b(b, 0); + r_.i256 = __lasx_xvnor_v(r_.i256, r_.i256); + a_.i256 = __lasx_xvshuf_b(a_.i256, a_.i256, __lasx_xvand_v(b_.i256, __lasx_xvreplgr2vr_b(15))); + r_.i256 = __lasx_xvand_v(r_.i256, a_.i256); + #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -3970,6 +4252,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_shuffle_epi32(a, imm8) __lasx_xvshuf4i_w(a, imm8) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) # define simde_mm256_shuffle_epi32(a, imm8) \ simde_mm256_set_m128i( \ @@ -3977,11 +4261,11 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \ + const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ simde__m256i_from_private((simde__m256i_private) { .i32 = \ SIMDE_SHUFFLE_VECTOR_(32, 32, \ - (simde__tmp_a_).i32, \ - (simde__tmp_a_).i32, \ + (simde_tmp_a_).i32, \ + (simde_tmp_a_).i32, \ ((imm8) ) & 3, \ ((imm8) >> 2) & 3, \ ((imm8) >> 4) & 3, \ @@ -3998,6 +4282,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_shufflehi_epi16(a, imm8) __lasx_xvextrins_d(__lasx_xvshuf4i_h(a, imm8), a, 0) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_shufflehi_epi16(a, imm8) \ simde_mm256_set_m128i( \ @@ -4005,11 +4291,11 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) 
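The shufflehi/shufflelo mappings above work because __lasx_xvshuf4i_h reorders every group of four 16-bit lanes by imm8, and __lasx_xvextrins_d then copies the untouched 64-bit half of each 128-bit lane back from the original operand (selector 0 restores the low half for shufflehi; 0x11 restores the high half for shufflelo). A scalar sketch of the shufflehi case (illustrative names, not part of the patch):

#include <stdint.h>
#include <stddef.h>

/* Reference model: within each 128-bit lane the low four words pass through
 * and the high four are reordered by the 2-bit fields of imm8. */
static void shufflehi_epi16_ref(const int16_t a[16], int imm8, int16_t r[16]) {
  for (size_t lane = 0; lane < 16; lane += 8) {  /* two 128-bit lanes */
    for (size_t i = 0; i < 4; i++) {
      r[lane + i]     = a[lane + i];
      r[lane + 4 + i] = a[lane + 4 + ((imm8 >> (2 * i)) & 3)];
    }
  }
}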
simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \ + const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ simde__m256i_from_private((simde__m256i_private) { .i16 = \ SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ 0, 1, 2, 3, \ (((imm8) ) & 3) + 4, \ (((imm8) >> 2) & 3) + 4, \ @@ -4034,6 +4320,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_shufflelo_epi16(a, imm8) __lasx_xvextrins_d(__lasx_xvshuf4i_h(a, imm8), a, 0x11) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_shufflelo_epi16(a, imm8) \ simde_mm256_set_m128i( \ @@ -4041,11 +4329,11 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \ + const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ simde__m256i_from_private((simde__m256i_private) { .i16 = \ SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ (((imm8) ) & 3), \ (((imm8) >> 2) & 3), \ (((imm8) >> 4) & 3), \ @@ -4072,6 +4360,8 @@ simde__m256i simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sign_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsigncov_b(b, a); #else simde__m256i_private r_, @@ -4080,7 +4370,7 @@ simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; + r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? -a_.i8[i] : a_.i8[i]; } return simde__m256i_from_private(r_); @@ -4096,6 +4386,8 @@ simde__m256i simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sign_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsigncov_h(b, a); #else simde__m256i_private r_, @@ -4104,7 +4396,7 @@ simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? -a_.i16[i] : a_.i16[i]; } return simde__m256i_from_private(r_); @@ -4120,6 +4412,8 @@ simde__m256i simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sign_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsigncov_w(b, a); #else simde__m256i_private r_, @@ -4128,7 +4422,7 @@ simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; + r_.i32[i] = (b_.i32[i] == INT32_C(0)) ? INT32_C(0) : (b_.i32[i] < INT32_C(0)) ? 
-a_.i32[i] : a_.i32[i]; } return simde__m256i_from_private(r_); @@ -4144,6 +4438,8 @@ simde__m256i simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sll_epi16(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(count[0])); #else simde__m256i_private r_, @@ -4183,6 +4479,8 @@ simde__m256i simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sll_epi32(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsll_w(a, __lasx_xvreplgr2vr_w(count[0])); #else simde__m256i_private r_, @@ -4222,6 +4520,8 @@ simde__m256i simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sll_epi64(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsll_d(a, __lasx_xvreplgr2vr_d(count[0])); #else simde__m256i_private r_, @@ -4273,6 +4573,8 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) { r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v16i16)a_.i256 << imm8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8); #else @@ -4310,6 +4612,8 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) { r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v8i32)a_.i256 << imm8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8); #else @@ -4342,7 +4646,9 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8) r_, a_ = simde__m256i_to_private(a); -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v4i64)a_.i256 << imm8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8); #else SIMDE_VECTORIZE @@ -4386,6 +4692,8 @@ simde_mm256_slli_si256 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_slli_si256(a, imm8) __lasx_xvbsll_v(a, imm8) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) # define simde_mm256_slli_si256(a, imm8) \ simde_mm256_set_m128i( \ @@ -4413,6 +4721,9 @@ simde_mm_sllv_epi32 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vshlq_u32(a_.neon_u32, vreinterpretq_s32_u32(b_.neon_u32)); r_.neon_u32 = vandq_u32(r_.neon_u32, vcltq_u32(b_.neon_u32, vdupq_n_u32(32))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vand_v(r_.lsx_i64, __lsx_vslei_wu(b_.lsx_i64, 31)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < UINT32_C(32))) & (a_.u32 << b_.u32); #else @@ -4440,7 +4751,10 @@ simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) { b_ = simde__m256i_to_private(b), r_; - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsll_w(a_.i256, b_.i256); + r_.i256 = __lasx_xvand_v(r_.i256, __lasx_xvslei_wu(b_.i256, 31)); + #elif 
SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) @@ -4473,6 +4787,9 @@ simde_mm_sllv_epi64 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_u64 = vshlq_u64(a_.neon_u64, vreinterpretq_s64_u64(b_.neon_u64)); r_.neon_u64 = vandq_u64(r_.neon_u64, vcltq_u64(b_.neon_u64, vdupq_n_u64(64))); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vand_v(r_.lsx_i64, __lsx_vsle_du(b_.lsx_i64, __lsx_vreplgr2vr_d(63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64); #else @@ -4500,7 +4817,10 @@ simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) { b_ = simde__m256i_to_private(b), r_; - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsll_d(a_.i256, b_.i256); + r_.i256 = __lasx_xvand_v(r_.i256, __lasx_xvsle_du(b_.i256, __lasx_xvreplgr2vr_d(63))); + #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) @@ -4527,6 +4847,8 @@ simde__m256i simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sra_epi16(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsra_h(a, __lasx_xvreplgr2vr_h(count[0] > 15 ? 15 : count[0])); #else simde__m256i_private r_, @@ -4566,6 +4888,8 @@ simde__m256i simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sra_epi32(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsra_w(a, __lasx_xvreplgr2vr_w(count[0] > 31 ? 
31 : count[0])); #else simde__m256i_private r_, @@ -4610,7 +4934,9 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8) if (shift > 15) shift = 15; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v16i16)a_.i256 >> shift); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); #else SIMDE_VECTORIZE @@ -4645,7 +4971,9 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8) if (shift > 31) shift = 31; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v8i32)a_.i256 >> shift); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); #else SIMDE_VECTORIZE @@ -4674,6 +5002,8 @@ simde__m128i simde_mm_srav_epi32 (simde__m128i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_srav_epi32(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lsx_vsra_w(a, __lsx_vmini_wu(count, 31)); #else simde__m128i_private r_, @@ -4704,6 +5034,8 @@ simde__m256i simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srav_epi32(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsra_w(a, __lasx_xvmini_wu(count, 31)); #else simde__m256i_private r_, @@ -4735,6 +5067,8 @@ simde__m256i simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srl_epi16(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsrl_h(a, __lasx_xvreplgr2vr_h(count[0] > 16 ? 16 : count[0])); #else simde__m256i_private r_, @@ -4772,6 +5106,8 @@ simde__m256i simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srl_epi32(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsrl_w(a, __lasx_xvreplgr2vr_w(count[0] > 32 ? 32 : count[0])); #else simde__m256i_private r_, @@ -4809,6 +5145,8 @@ simde__m256i simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srl_epi64(a, count); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsrl_d(a, __lasx_xvreplgr2vr_d(count[0] > 64 ? 
64 : count[0])); #else simde__m256i_private r_, @@ -4857,6 +5195,8 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) { r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v16u16)a_.i256 >> imm8); #else if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) { simde_memset(&r_, 0, sizeof(r_)); @@ -4900,6 +5240,8 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) { r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v8u32)a_.i256 >> imm8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); #else @@ -4932,7 +5274,9 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) r_, a_ = simde__m256i_to_private(a); -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v4u64)a_.i256 >> imm8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8); #else SIMDE_VECTORIZE @@ -4976,6 +5320,8 @@ simde_mm256_srli_si256 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) +# define simde_mm256_srli_si256(a, imm8) __lasx_xvbsrl_v(a, imm8) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) # define simde_mm256_srli_si256(a, imm8) \ simde_mm256_set_m128i( \ @@ -5013,6 +5359,8 @@ simde_mm_srlv_epi32 (simde__m128i a, simde__m128i b) { } #if defined(SIMDE_X86_AVX2_NATIVE) #define simde_mm_srlv_epi32(a, b) _mm_srlv_epi32(a, b) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm_srlv_epi32(a, b) __lsx_vand_v(__lsx_vsrl_w(a, b), __lsx_vslei_wu(b, 31)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm_srlv_epi32 @@ -5040,6 +5388,8 @@ simde_mm256_srlv_epi32 (simde__m256i a, simde__m256i b) { } #if defined(SIMDE_X86_AVX2_NATIVE) #define simde_mm256_srlv_epi32(a, b) _mm256_srlv_epi32(a, b) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_srlv_epi32(a, b) __lasx_xvand_v(__lasx_xvsrl_w(a, b), __lasx_xvslei_wu(b, 31)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_srlv_epi32 @@ -5067,6 +5417,8 @@ simde_mm_srlv_epi64 (simde__m128i a, simde__m128i b) { } #if defined(SIMDE_X86_AVX2_NATIVE) #define simde_mm_srlv_epi64(a, b) _mm_srlv_epi64(a, b) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm_srlv_epi64(a, b) __lsx_vand_v(__lsx_vsrl_d(a, b), __lsx_vsle_du(b, __lsx_vreplgr2vr_d(63))) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm_srlv_epi64 @@ -5094,6 +5446,8 @@ simde_mm256_srlv_epi64 (simde__m256i a, simde__m256i b) { } #if defined(SIMDE_X86_AVX2_NATIVE) #define simde_mm256_srlv_epi64(a, b) _mm256_srlv_epi64(a, b) +#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #define simde_mm256_srlv_epi64(a, b) __lasx_xvand_v(__lasx_xvsrl_d(a, b), __lasx_xvsle_du(b, __lasx_xvreplgr2vr_d(63))) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_srlv_epi64 @@ -5105,6 +5459,10 @@ simde__m256i simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr)); + #elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvld(mem_addr, 0); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + return __builtin_nontemporal_load(mem_addr); #else simde__m256i r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); @@ -5120,6 +5478,8 @@ simde__m256i simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sub_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsub_b(a, b); #else simde__m256i_private r_, @@ -5151,6 +5511,8 @@ simde__m256i simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sub_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsub_h(a, b); #else simde__m256i_private r_, @@ -5196,6 +5558,8 @@ simde__m256i simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sub_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsub_w(a, b); #else simde__m256i_private r_, @@ -5241,6 +5605,8 @@ simde__m256i simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sub_epi64(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvsub_d(a, b); #else simde__m256i_private r_, @@ -5295,6 +5661,8 @@ simde__m256i simde_mm256_subs_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_subs_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssub_b(a, b); #else simde__m256i_private r_, @@ -5324,6 +5692,8 @@ simde__m256i simde_mm256_subs_epi16(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_subs_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssub_h(a, b); #else simde__m256i_private r_, @@ -5367,6 +5737,8 @@ simde__m256i simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_subs_epu8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssub_bu(a, b); #else simde__m256i_private r_, @@ -5396,6 +5768,8 @@ simde__m256i simde_mm256_subs_epu16(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_subs_epu16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvssub_hu(a, b); #else simde__m256i_private r_, @@ -5442,6 +5816,8 @@ simde__m256i simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpacklo_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvl_b(b, a); #else simde__m256i_private r_, @@ -5478,6 +5854,8 @@ simde__m256i simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpacklo_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvl_h(b, a); #else simde__m256i_private r_, @@ -5511,6 +5889,8 @@ simde__m256i simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpacklo_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvl_w(b, a); #else simde__m256i_private r_, @@ -5544,6 +5924,8 @@ simde__m256i simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpacklo_epi64(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvl_d(b, a); #else simde__m256i_private r_, @@ -5576,6 
+5958,8 @@ simde__m256i simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpackhi_epi8(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvh_b(b, a); #else simde__m256i_private r_, @@ -5612,6 +5996,8 @@ simde__m256i simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpackhi_epi16(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvh_h(b, a); #else simde__m256i_private r_, @@ -5646,6 +6032,8 @@ simde__m256i simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpackhi_epi32(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvh_w(b, a); #else simde__m256i_private r_, @@ -5679,6 +6067,8 @@ simde__m256i simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_unpackhi_epi64(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvilvh_d(b, a); #else simde__m256i_private r_, @@ -5711,6 +6101,8 @@ simde__m256i simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_xor_si256(a, b); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvxor_v(a, b); #else simde__m256i_private r_, diff --git a/x86/avx512.h b/x86/avx512.h index 1215e8e4e..e3654bc37 100644 --- a/x86/avx512.h +++ b/x86/avx512.h @@ -55,6 +55,7 @@ #include "avx512/cvt.h" #include "avx512/cvtt.h" #include "avx512/cvts.h" +#include "avx512/cvtus.h" #include "avx512/dbsad.h" #include "avx512/div.h" #include "avx512/dpbf16.h" @@ -68,11 +69,17 @@ #include "avx512/fixupimm_round.h" #include "avx512/flushsubnormal.h" #include "avx512/fmadd.h" +#include "avx512/fmaddsub.h" #include "avx512/fmsub.h" #include "avx512/fnmadd.h" #include "avx512/fnmsub.h" +#include "avx512/fpclass.h" +#include "avx512/gather.h" #include "avx512/insert.h" +#include "avx512/kand.h" #include "avx512/kshift.h" +#include "avx512/knot.h" +#include "avx512/kxor.h" #include "avx512/load.h" #include "avx512/loadu.h" #include "avx512/lzcnt.h" @@ -92,11 +99,14 @@ #include "avx512/or.h" #include "avx512/packs.h" #include "avx512/packus.h" +#include "avx512/permutex.h" #include "avx512/permutexvar.h" #include "avx512/permutex2var.h" #include "avx512/popcnt.h" #include "avx512/range.h" #include "avx512/range_round.h" +#include "avx512/rcp.h" +#include "avx512/reduce.h" #include "avx512/rol.h" #include "avx512/rolv.h" #include "avx512/ror.h" diff --git a/x86/avx512/2intersect.h b/x86/avx512/2intersect.h index 66884f1dd..81b0ee1fb 100644 --- a/x86/avx512/2intersect.h +++ b/x86/avx512/2intersect.h @@ -37,36 +37,35 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES void simde_mm_2intersect_epi32(simde__m128i a, simde__m128i b, simde__mmask8 *k1, simde__mmask8 *k2) { - #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm_2intersect_epi32(a, b, k1, k2); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; j++) { - const int32_t m = a_.i32[i] == b_.i32[j]; - k1_ |= m << i; - k2_ |= m << j; - } + simde__m128i_private + a_ = simde__m128i_to_private(a), 
+ b_ = simde__m128i_to_private(b); + simde__mmask8 + k1_ = 0, + k2_ = 0; + + for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { + #if defined(SIMDE_ENABLE_OPENMP) + #pragma omp simd reduction(|:k1_) reduction(|:k2_) + #else + SIMDE_VECTORIZE + #endif + for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; j++) { + const int32_t m = a_.i32[i] == b_.i32[j]; + k1_ |= m << i; + k2_ |= m << j; } + } - *k1 = k1_; - *k2 = k2_; - #endif + *k1 = k1_; + *k2 = k2_; } +#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_2intersect_epi32(a, b, k1, k2) _mm_2intersect_epi32(a, b, k1, k2) +#endif #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef __mm_2intersect_epi32 - #define __mm_2intersect_epi32(a,b, k1, k2) simde_mm_2intersect_epi32(a, b, k1, k2) + #undef _mm_2intersect_epi32 + #define _mm_2intersect_epi32(a, b, k1, k2) simde_mm_2intersect_epi32(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -100,8 +99,8 @@ simde_mm_2intersect_epi64(simde__m128i a, simde__m128i b, simde__mmask8 *k1, sim #endif } #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef __mm_2intersect_epi64 - #define __mm_2intersect_epi64(a,b, k1, k2) simde_mm_2intersect_epi64(a, b, k1, k2) + #undef _mm_2intersect_epi64 + #define _mm_2intersect_epi64(a, b, k1, k2) simde_mm_2intersect_epi64(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -136,7 +135,7 @@ simde_mm256_2intersect_epi32(simde__m256i a, simde__m256i b, simde__mmask8 *k1, } #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_2intersect_epi32 - #define _mm256_2intersect_epi32(a,b, k1, k2) simde_mm256_2intersect_epi32(a, b, k1, k2) + #define _mm256_2intersect_epi32(a, b, k1, k2) simde_mm256_2intersect_epi32(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -171,7 +170,7 @@ simde_mm256_2intersect_epi64(simde__m256i a, simde__m256i b, simde__mmask8 *k1, } #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_2intersect_epi64 - #define _mm256_2intersect_epi64(a,b, k1, k2) simde_mm256_2intersect_epi64(a, b, k1, k2) + #define _mm256_2intersect_epi64(a, b, k1, k2) simde_mm256_2intersect_epi64(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/x86/avx512/abs.h b/x86/avx512/abs.h index 5c0871b75..5ff001485 100644 --- a/x86/avx512/abs.h +++ b/x86/avx512/abs.h @@ -524,7 +524,7 @@ simde_mm512_mask_abs_ps(simde__m512 src, simde__mmask16 k, simde__m512 v2) { SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_abs_pd(simde__m512d v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,3,0)) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_87467) return _mm512_abs_pd(v2); #elif defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) /* gcc bug: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01962.html */ @@ -560,7 +560,7 @@ simde_mm512_abs_pd(simde__m512d v2) { SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_abs_pd(simde__m512d src, simde__mmask8 k, simde__m512d v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,3,0)) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_87467) return _mm512_mask_abs_pd(src, k, 
v2); #elif defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) /* gcc bug: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01962.html */ diff --git a/x86/avx512/add.h b/x86/avx512/add.h index 2c4c98e6c..d192b2f57 100644 --- a/x86/avx512/add.h +++ b/x86/avx512/add.h @@ -402,23 +402,7 @@ simde_mm512_add_epi32 (simde__m512i a, simde__m512i b) { a_ = simde__m512i_to_private(a), b_ = simde__m512i_to_private(b); - #if defined(SIMDE_ARM_SVE_NATIVE) - const size_t n = sizeof(a_.i32) / sizeof(a_.i32[0]); - size_t i = 0; - svbool_t pg = svwhilelt_b32(i, n); - do { - svint32_t - va = svld1_s32(pg, &(a_.i32[i])), - vb = svld1_s32(pg, &(b_.i32[i])); - svst1_s32(pg, &(r_.i32[i]), svadd_s32_x(pg, va, vb)); - i += svcntw(); - pg = svwhilelt_b32(i, n); - } while (svptest_any(svptrue_b32(), pg)); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 + b_.i32; #else SIMDE_VECTORIZE diff --git a/x86/avx512/cast.h b/x86/avx512/cast.h index 5c4cafa5f..7f67a5730 100644 --- a/x86/avx512/cast.h +++ b/x86/avx512/cast.h @@ -100,6 +100,39 @@ simde_mm512_castps_si512 (simde__m512 a) { #define _mm512_castps_si512(a) simde_mm512_castps_si512(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_castph_si512 (simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_castph_si512(a); + #else + simde__m512i r; + simde_memcpy(&r, &a, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_castph_si512 + #define _mm512_castph_si512(a) simde_mm512_castph_si512(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_castsi512_ph (simde__m512i a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_castsi512_ph(a); + #else + simde__m512h r; + simde_memcpy(&r, &a, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_castsi512_ph + #define _mm512_castsi512_ph(a) simde_mm512_castsi512_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_castsi512_ps (simde__m512i a) { diff --git a/x86/avx512/cmp.h b/x86/avx512/cmp.h index 313d8bcb2..a09cac538 100644 --- a/x86/avx512/cmp.h +++ b/x86/avx512/cmp.h @@ -38,6 +38,208 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_mm512_cmp_epi8_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? 
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 <= b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 != b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] != b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), ~(a_.i8 < b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = !(a_.i8[i] < b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), ~(a_.i8 <= b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = !(a_.i8[i] <= b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi8_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_cmp_epi8_mask(a, b, imm8) _mm512_cmp_epi8_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi8_mask + #define _mm512_cmp_epi8_mask(a, b, imm8) simde_mm512_cmp_epi8_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmp_epi32_mask (simde__m256i a, simde__m256i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 == b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? 
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 != b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m256i_to_private(simde_x_mm256_setone_si256()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm256_movepi32_mask(simde__m256i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_cmp_epi32_mask(a, b, imm8) _mm256_cmp_epi32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmp_epi32_mask + #define _mm256_cmp_epi32_mask(a, b, imm8) simde_mm256_cmp_epi32_mask((a), (b), (imm8)) +#endif + SIMDE_HUGE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) @@ -46,7 +248,11 @@ simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) r_, a_ = simde__m512_to_private(a), b_ = simde__m512_to_private(b); - + #if !defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) + for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { + SIMDE_CONSTIFY_32_(simde_mm_cmp_ps, r_.m128[i], simde_mm_undefined_ps(), imm8, a_.m128[i], b_.m128[i]); + } + #else switch (imm8) { case SIMDE_CMP_EQ_OQ: case SIMDE_CMP_EQ_OS: @@ -229,7 +435,7 @@ simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) default: HEDLEY_UNREACHABLE(); } - + #endif return simde_mm512_movepi32_mask(simde_mm512_castps_si512(simde__m512_from_private(r_))); } #if defined(SIMDE_X86_AVX512F_NATIVE) @@ -237,7 +443,7 @@ simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm512_cmp_ps_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_cmp_ps_mask_r_, \ + simde_mm512_cmp_ps_mask_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_cmp_ps_mask_a_ = simde__m512_to_private((a)), \ simde_mm512_cmp_ps_mask_b_ = simde__m512_to_private((b)); \ \ @@ -250,7 +456,7 @@ simde_mm512_cmp_ps_mask (simde__m512 a, 
simde__m512 b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(256) #define simde_mm512_cmp_ps_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_cmp_ps_mask_r_, \ + simde_mm512_cmp_ps_mask_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_cmp_ps_mask_a_ = simde__m512_to_private((a)), \ simde_mm512_cmp_ps_mask_b_ = simde__m512_to_private((b)); \ \ @@ -294,7 +500,11 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) r_, a_ = simde__m512d_to_private(a), b_ = simde__m512d_to_private(b); - + #if !defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) + for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { + SIMDE_CONSTIFY_32_(simde_mm_cmp_pd, r_.m128d[i], simde_mm_undefined_pd(), imm8, a_.m128d[i], b_.m128d[i]); + } + #else switch (imm8) { case SIMDE_CMP_EQ_OQ: case SIMDE_CMP_EQ_OS: @@ -477,7 +687,7 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) default: HEDLEY_UNREACHABLE(); } - + #endif return simde_mm512_movepi64_mask(simde_mm512_castpd_si512(simde__m512d_from_private(r_))); } #if defined(SIMDE_X86_AVX512F_NATIVE) @@ -485,7 +695,7 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm512_cmp_pd_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_cmp_pd_mask_r_, \ + simde_mm512_cmp_pd_mask_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_cmp_pd_mask_a_ = simde__m512d_to_private((a)), \ simde_mm512_cmp_pd_mask_b_ = simde__m512d_to_private((b)); \ \ @@ -498,7 +708,7 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(256) #define simde_mm512_cmp_pd_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_cmp_pd_mask_r_, \ + simde_mm512_cmp_pd_mask_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_cmp_pd_mask_a_ = simde__m512d_to_private((a)), \ simde_mm512_cmp_pd_mask_b_ = simde__m512d_to_private((b)); \ \ @@ -534,6 +744,978 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) #define _mm_cmp_pd_mask(a, b, imm8) simde_mm_cmp_pd_mask((a), (b), (imm8)) #endif +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmp_ph_mask (simde__m512h a, simde__m512h b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { + simde__m512h_private + r_, + a_ = simde__m512h_to_private(a), + b_ = simde__m512h_to_private(b); + + switch (imm8) { + case SIMDE_CMP_EQ_OQ: + case SIMDE_CMP_EQ_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 == b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + simde_float16_as_uint16(a_.f16[i]) == simde_float16_as_uint16(b_.f16[i]) + && !simde_isnanhf(a_.f16[i]) && !simde_isnanhf(b_.f16[i]) + ) ? 
~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_LT_OQ: + case SIMDE_CMP_LT_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 < b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = (simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i])) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_LE_OQ: + case SIMDE_CMP_LE_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 <= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = (simde_float16_to_float32(a_.f16[i]) <= simde_float16_to_float32(b_.f16[i])) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_UNORD_Q: + case SIMDE_CMP_UNORD_S: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != a_.f16) | (b_.f16 != b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + (simde_float16_to_float32(a_.f16[i]) != simde_float16_to_float32(a_.f16[i])) + || (simde_float16_to_float32(b_.f16[i]) != simde_float16_to_float32(b_.f16[i])) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NEQ_UQ: + case SIMDE_CMP_NEQ_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + (simde_float16_as_uint16(a_.f16[i]) != simde_float16_as_uint16(b_.f16[i])) + || simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NEQ_OQ: + case SIMDE_CMP_NEQ_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 == a_.f16) & (b_.f16 == b_.f16) & (a_.f16 != b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + !(simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) + && (simde_float16_as_uint16(a_.f16[i]) != simde_float16_as_uint16(b_.f16[i])) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NLT_UQ: + case SIMDE_CMP_NLT_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 < b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NLE_UQ: + case SIMDE_CMP_NLE_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 <= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) <= simde_float16_to_float32(b_.f16[i]) + ) ? 
~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_ORD_Q: + case SIMDE_CMP_ORD_S: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ((a_.f16 == a_.f16) & (b_.f16 == b_.f16))); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = (simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) ? INT16_C(0) : ~INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_EQ_UQ: + case SIMDE_CMP_EQ_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != a_.f16) | (b_.f16 != b_.f16) | (a_.f16 == b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + (simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) + || (simde_float16_as_uint16(a_.f16[i]) == simde_float16_as_uint16(b_.f16[i])) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NGE_UQ: + case SIMDE_CMP_NGE_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 >= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) >= simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NGT_UQ: + case SIMDE_CMP_NGT_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 > b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_FALSE_OQ: + case SIMDE_CMP_FALSE_OS: + r_ = simde__m512h_to_private(simde_mm512_setzero_ph()); + break; + + case SIMDE_CMP_GE_OQ: + case SIMDE_CMP_GE_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 >= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + simde_float16_to_float32(a_.f16[i]) >= simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_GT_OQ: + case SIMDE_CMP_GT_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 > b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) + ) ? 
~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_TRUE_UQ: + case SIMDE_CMP_TRUE_US: + r_ = simde__m512h_to_private(simde_x_mm512_setone_ph()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi16_mask(simde_mm512_castph_si512(simde__m512h_from_private(r_))); +} +#if defined(SIMDE_X86_AVX512FP16_NATIVE) + #define simde_mm512_cmp_ph_mask(a, b, imm8) _mm512_cmp_ph_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_ph_mask + #define _mm512_cmp_ph_mask(a, b, imm8) simde_mm512_cmp_ph_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmp_epi16_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 == b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 <= b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 != b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] != b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.i16 < b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = !(a_.i16[i] < b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.i16 <= b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = !(a_.i16[i] <= b_.i16[i]) ? 
~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_cmp_epi16_mask(a, b, imm8) _mm512_cmp_epi16_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi16_mask + #define _mm512_cmp_epi16_mask(a, b, imm8) simde_mm512_cmp_epi16_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_mask_cmp_epi16_mask(k1, a, b, imm8) _mm512_mask_cmp_epi16_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epi16_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epi16_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epi16_mask +#define _mm512_mask_cmp_epi16_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epi16_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmp_epi32_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 == b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 != b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] <= b_.i32[i]) ? 
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epi32_mask(a, b, imm8) _mm512_cmp_epi32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi32_mask + #define _mm512_cmp_epi32_mask(a, b, imm8) simde_mm512_cmp_epi32_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmp_epi64_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 == b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 < b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] < b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 <= b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 != b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] != b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.i64 < b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = !(a_.i64[i] < b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.i64 <= b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = !(a_.i64[i] <= b_.i64[i]) ? 
~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epi64_mask(a, b, imm8) _mm512_cmp_epi64_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi64_mask + #define _mm512_cmp_epi64_mask(a, b, imm8) simde_mm512_cmp_epi64_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmp_epu16_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 == b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] == b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 < b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 <= b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 != b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] != b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), ~(a_.u16 < b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = !(a_.u16[i] < b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), ~(a_.u16 <= b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = !(a_.u16[i] <= b_.u16[i]) ? 
~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_cmp_epu16_mask(a, b, imm8) _mm512_cmp_epu16_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epu16_mask + #define _mm512_cmp_epu16_mask(a, b, imm8) simde_mm512_cmp_epu16_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_mask_cmp_epu16_mask(k1, a, b, imm8) _mm512_mask_cmp_epu16_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epu16_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu16_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epu16_mask +#define _mm512_mask_cmp_epu16_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu16_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmp_epu32_mask (simde__m256i a, simde__m256i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 == b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 < b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 != b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 < b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] <= b_.u32[i]) ? 
~UINT32_C(0) : UINT32_C(0);
+        }
+      #endif
+      break;
+
+    case SIMDE_MM_CMPINT_TRUE:
+      r_ = simde__m256i_to_private(simde_x_mm256_setone_si256());
+      break;
+
+    default:
+      HEDLEY_UNREACHABLE();
+  }
+
+  return simde_mm256_movepi32_mask(simde__m256i_from_private(r_));
+}
+#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+  #define simde_mm256_cmp_epu32_mask(a, b, imm8) _mm256_cmp_epu32_mask((a), (b), (imm8))
+#endif
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+  #undef _mm256_cmp_epu32_mask
+  #define _mm256_cmp_epu32_mask(a, b, imm8) simde_mm256_cmp_epu32_mask((a), (b), (imm8))
+#endif
+
+#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+  #define simde_mm256_mask_cmp_epu32_mask(k1, a, b, imm8) _mm256_mask_cmp_epu32_mask(k1, a, b, imm8)
+#else
+  #define simde_mm256_mask_cmp_epu32_mask(k1, a, b, imm8) (k1) & simde_mm256_cmp_epu32_mask((a), (b), (imm8))
+#endif
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+  #undef _mm256_mask_cmp_epu32_mask
+  #define _mm256_mask_cmp_epu32_mask(k1, a, b, imm8) simde_mm256_mask_cmp_epu32_mask((k1), (a), (b), (imm8))
+#endif
+
+SIMDE_HUGE_FUNCTION_ATTRIBUTES
+simde__mmask16
+simde_mm512_cmp_epu32_mask (simde__m512i a, simde__m512i b, const int imm8)
+  SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) {
+  simde__m512i_private
+    r_,
+    a_ = simde__m512i_to_private(a),
+    b_ = simde__m512i_to_private(b);
+
+  switch (imm8) {
+    case SIMDE_MM_CMPINT_EQ:
+      #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+        r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 == b_.u32));
+      #else
+        SIMDE_VECTORIZE
+        for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+          r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0);
+        }
+      #endif
+      break;
+
+    case SIMDE_MM_CMPINT_LT:
+      #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+        r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 < b_.u32));
+      #else
+        SIMDE_VECTORIZE
+        for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+          r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0);
+        }
+      #endif
+      break;
+
+    case SIMDE_MM_CMPINT_LE:
+      #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+        r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 <= b_.u32));
+      #else
+        SIMDE_VECTORIZE
+        for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+          r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0);
+        }
+      #endif
+      break;
+
+    case SIMDE_MM_CMPINT_FALSE:
+      r_ = simde__m512i_to_private(simde_mm512_setzero_si512());
+      break;
+
+
+    case SIMDE_MM_CMPINT_NE:
+      #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+        r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 != b_.u32));
+      #else
+        SIMDE_VECTORIZE
+        for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+          r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0);
+        }
+      #endif
+      break;
+
+    case SIMDE_MM_CMPINT_NLT:
+      #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+        r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 < b_.u32));
+      #else
+        SIMDE_VECTORIZE
+        for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
+          r_.u32[i] = !(a_.u32[i] < b_.u32[i]) ?
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epu32_mask(a, b, imm8) _mm512_cmp_epu32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epu32_mask + #define _mm512_cmp_epu32_mask(a, b, imm8) simde_mm512_cmp_epu32_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_cmp_epu32_mask(k1, a, b, imm8) _mm512_mask_cmp_epu32_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epu32_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu32_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epu32_mask +#define _mm512_mask_cmp_epu32_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu32_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmp_epu64_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 == b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 < b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] < b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 <= b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 != b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), ~(a_.u64 < b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = !(a_.u64[i] < b_.u64[i]) ? 
~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), ~(a_.u64 <= b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = !(a_.u64[i] <= b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epu64_mask(a, b, imm8) _mm512_cmp_epu64_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epu64_mask + #define _mm512_cmp_epu64_mask(a, b, imm8) simde_mm512_cmp_epu64_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_cmp_epu64_mask(k1, a, b, imm8) _mm512_mask_cmp_epu64_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epu64_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu64_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epu64_mask +#define _mm512_mask_cmp_epu64_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu64_mask((k1), (a), (b), (imm8)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/cmpeq.h b/x86/avx512/cmpeq.h index 148c93184..41f90b3e9 100644 --- a/x86/avx512/cmpeq.h +++ b/x86/avx512/cmpeq.h @@ -167,6 +167,54 @@ simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512 #define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpeq_epu16_mask (simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpeq_epu16_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); + r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epu16_mask + #define _mm512_cmpeq_epu16_mask(a, b) simde_mm512_cmpeq_epu16_mask((a), (b)) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_mask_cmpeq_epu16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpeq_epu16_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpeq_epu16_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epu16_mask + #define _mm512_mask_cmpeq_epu16_mask(k1, a, b) simde_mm512_mask_cmpeq_epu16_mask(k1, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm512_cmpeq_ps_mask (simde__m512 a, simde__m512 b) { diff --git a/x86/avx512/cmpge.h b/x86/avx512/cmpge.h index a94a0c410..d0d428790 100644 --- a/x86/avx512/cmpge.h +++ b/x86/avx512/cmpge.h @@ -78,8 +78,8 @@ simde_mm_cmpge_epi8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi8_mask - #define _mm512_cmpge_epi8_mask(a, b) simde_mm512_cmpge_epi8_mask((a), (b)) + #undef _mm_cmpge_epi8_mask + #define _mm_cmpge_epi8_mask(a, b) simde_mm_cmpge_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -93,7 +93,7 @@ simde_mm_mask_cmpge_epi8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi8_mask - #define _mm_mask_cmpge_epi8_mask(src, k, a, b) simde_mm_mask_cmpge_epi8_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi8_mask(k, a, b) simde_mm_mask_cmpge_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -134,8 +134,8 @@ simde_mm256_cmpge_epi8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi8_mask - #define _mm512_cmpge_epi8_mask(a, b) simde_mm512_cmpge_epi8_mask((a), (b)) + #undef _mm256_cmpge_epi8_mask + #define _mm256_cmpge_epi8_mask(a, b) simde_mm256_cmpge_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -149,7 +149,7 @@ simde_mm256_mask_cmpge_epi8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi8_mask - #define _mm256_mask_cmpge_epi8_mask(src, k, a, b) simde_mm256_mask_cmpge_epi8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi8_mask(k, a, b) simde_mm256_mask_cmpge_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -209,7 +209,7 @@ simde_mm512_mask_cmpge_epi8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi8_mask - #define _mm512_mask_cmpge_epi8_mask(src, k, a, b) simde_mm512_mask_cmpge_epi8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi8_mask(k, a, b) simde_mm512_mask_cmpge_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -252,8 +252,8 @@ simde_mm_cmpge_epu8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu8_mask - #define _mm512_cmpge_epu8_mask(a, b) simde_mm512_cmpge_epu8_mask((a), (b)) + #undef _mm_cmpge_epu8_mask + #define _mm_cmpge_epu8_mask(a, b) 
simde_mm_cmpge_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -267,7 +267,7 @@ simde_mm_mask_cmpge_epu8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu8_mask - #define _mm_mask_cmpge_epu8_mask(src, k, a, b) simde_mm_mask_cmpge_epu8_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu8_mask(k, a, b) simde_mm_mask_cmpge_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -308,8 +308,8 @@ simde_mm256_cmpge_epu8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu8_mask - #define _mm512_cmpge_epu8_mask(a, b) simde_mm512_cmpge_epu8_mask((a), (b)) + #undef _mm256_cmpge_epu8_mask + #define _mm256_cmpge_epu8_mask(a, b) simde_mm256_cmpge_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -323,7 +323,7 @@ simde_mm256_mask_cmpge_epu8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu8_mask - #define _mm256_mask_cmpge_epu8_mask(src, k, a, b) simde_mm256_mask_cmpge_epu8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu8_mask(k, a, b) simde_mm256_mask_cmpge_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -383,7 +383,7 @@ simde_mm512_mask_cmpge_epu8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu8_mask - #define _mm512_mask_cmpge_epu8_mask(src, k, a, b) simde_mm512_mask_cmpge_epu8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu8_mask(k, a, b) simde_mm512_mask_cmpge_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -426,8 +426,8 @@ simde_mm_cmpge_epi16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi16_mask - #define _mm512_cmpge_epi16_mask(a, b) simde_mm512_cmpge_epi16_mask((a), (b)) + #undef _mm_cmpge_epi16_mask + #define _mm_cmpge_epi16_mask(a, b) simde_mm_cmpge_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -441,7 +441,7 @@ simde_mm_mask_cmpge_epi16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi16_mask - #define _mm_mask_cmpge_epi16_mask(src, k, a, b) simde_mm_mask_cmpge_epi16_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi16_mask(k, a, b) simde_mm_mask_cmpge_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -482,8 +482,8 @@ simde_mm256_cmpge_epi16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi16_mask - #define _mm512_cmpge_epi16_mask(a, b) simde_mm512_cmpge_epi16_mask((a), (b)) + #undef _mm256_cmpge_epi16_mask + #define _mm256_cmpge_epi16_mask(a, b) simde_mm256_cmpge_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -497,7 +497,7 @@ simde_mm256_mask_cmpge_epi16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi16_mask - #define _mm256_mask_cmpge_epi16_mask(src, k, a, 
b) simde_mm256_mask_cmpge_epi16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi16_mask(k, a, b) simde_mm256_mask_cmpge_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -557,7 +557,7 @@ simde_mm512_mask_cmpge_epi16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi16_mask - #define _mm512_mask_cmpge_epi16_mask(src, k, a, b) simde_mm512_mask_cmpge_epi16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi16_mask(k, a, b) simde_mm512_mask_cmpge_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -600,8 +600,8 @@ simde_mm_cmpge_epu16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu16_mask - #define _mm512_cmpge_epu16_mask(a, b) simde_mm512_cmpge_epu16_mask((a), (b)) + #undef _mm_cmpge_epu16_mask + #define _mm_cmpge_epu16_mask(a, b) simde_mm_cmpge_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -615,7 +615,7 @@ simde_mm_mask_cmpge_epu16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu16_mask - #define _mm_mask_cmpge_epu16_mask(src, k, a, b) simde_mm_mask_cmpge_epu16_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu16_mask(k, a, b) simde_mm_mask_cmpge_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -656,8 +656,8 @@ simde_mm256_cmpge_epu16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu16_mask - #define _mm512_cmpge_epu16_mask(a, b) simde_mm512_cmpge_epu16_mask((a), (b)) + #undef _mm256_cmpge_epu16_mask + #define _mm256_cmpge_epu16_mask(a, b) simde_mm256_cmpge_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -671,7 +671,7 @@ simde_mm256_mask_cmpge_epu16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu16_mask - #define _mm256_mask_cmpge_epu16_mask(src, k, a, b) simde_mm256_mask_cmpge_epu16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu16_mask(k, a, b) simde_mm256_mask_cmpge_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -731,7 +731,7 @@ simde_mm512_mask_cmpge_epu16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu16_mask - #define _mm512_mask_cmpge_epu16_mask(src, k, a, b) simde_mm512_mask_cmpge_epu16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu16_mask(k, a, b) simde_mm512_mask_cmpge_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -774,8 +774,8 @@ simde_mm_cmpge_epi32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi32_mask - #define _mm512_cmpge_epi32_mask(a, b) simde_mm512_cmpge_epi32_mask((a), (b)) + #undef _mm_cmpge_epi32_mask + #define _mm_cmpge_epi32_mask(a, b) simde_mm_cmpge_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -789,7 +789,7 @@ simde_mm_mask_cmpge_epi32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi32_mask - #define _mm_mask_cmpge_epi32_mask(src, k, a, b) 
simde_mm_mask_cmpge_epi32_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi32_mask(k, a, b) simde_mm_mask_cmpge_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -830,8 +830,8 @@ simde_mm256_cmpge_epi32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi32_mask - #define _mm512_cmpge_epi32_mask(a, b) simde_mm512_cmpge_epi32_mask((a), (b)) + #undef _mm256_cmpge_epi32_mask + #define _mm256_cmpge_epi32_mask(a, b) simde_mm256_cmpge_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -845,7 +845,7 @@ simde_mm256_mask_cmpge_epi32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi32_mask - #define _mm256_mask_cmpge_epi32_mask(src, k, a, b) simde_mm256_mask_cmpge_epi32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi32_mask(k, a, b) simde_mm256_mask_cmpge_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -905,7 +905,7 @@ simde_mm512_mask_cmpge_epi32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi32_mask - #define _mm512_mask_cmpge_epi32_mask(src, k, a, b) simde_mm512_mask_cmpge_epi32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi32_mask(k, a, b) simde_mm512_mask_cmpge_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -948,8 +948,8 @@ simde_mm_cmpge_epu32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu32_mask - #define _mm512_cmpge_epu32_mask(a, b) simde_mm512_cmpge_epu32_mask((a), (b)) + #undef _mm_cmpge_epu32_mask + #define _mm_cmpge_epu32_mask(a, b) simde_mm_cmpge_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -963,7 +963,7 @@ simde_mm_mask_cmpge_epu32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu32_mask - #define _mm_mask_cmpge_epu32_mask(src, k, a, b) simde_mm_mask_cmpge_epu32_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu32_mask(k, a, b) simde_mm_mask_cmpge_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1004,8 +1004,8 @@ simde_mm256_cmpge_epu32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu32_mask - #define _mm512_cmpge_epu32_mask(a, b) simde_mm512_cmpge_epu32_mask((a), (b)) + #undef _mm256_cmpge_epu32_mask + #define _mm256_cmpge_epu32_mask(a, b) simde_mm256_cmpge_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1019,7 +1019,7 @@ simde_mm256_mask_cmpge_epu32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu32_mask - #define _mm256_mask_cmpge_epu32_mask(src, k, a, b) simde_mm256_mask_cmpge_epu32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu32_mask(k, a, b) simde_mm256_mask_cmpge_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1079,7 +1079,7 @@ simde_mm512_mask_cmpge_epu32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu32_mask - #define _mm512_mask_cmpge_epu32_mask(src, k, a, b) simde_mm512_mask_cmpge_epu32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu32_mask(k, a, b) simde_mm512_mask_cmpge_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1137,7 +1137,7 @@ 
simde_mm_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi64_mask - #define _mm_mask_cmpge_epi64_mask(src, k, a, b) simde_mm_mask_cmpge_epi64_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi64_mask(k, a, b) simde_mm_mask_cmpge_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1193,7 +1193,7 @@ simde_mm256_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi64_mask - #define _mm256_mask_cmpge_epi64_mask(src, k, a, b) simde_mm256_mask_cmpge_epi64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi64_mask(k, a, b) simde_mm256_mask_cmpge_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1253,7 +1253,7 @@ simde_mm512_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi64_mask - #define _mm512_mask_cmpge_epi64_mask(src, k, a, b) simde_mm512_mask_cmpge_epi64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi64_mask(k, a, b) simde_mm512_mask_cmpge_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1294,8 +1294,8 @@ simde_mm_cmpge_epu64_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu64_mask - #define _mm512_cmpge_epu64_mask(a, b) simde_mm512_cmpge_epu64_mask((a), (b)) + #undef _mm_cmpge_epu64_mask + #define _mm_cmpge_epu64_mask(a, b) simde_mm_cmpge_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1309,7 +1309,7 @@ simde_mm_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu64_mask - #define _mm_mask_cmpge_epu64_mask(src, k, a, b) simde_mm_mask_cmpge_epu64_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu64_mask(k, a, b) simde_mm_mask_cmpge_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1350,8 +1350,8 @@ simde_mm256_cmpge_epu64_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu64_mask - #define _mm512_cmpge_epu64_mask(a, b) simde_mm512_cmpge_epu64_mask((a), (b)) + #undef _mm256_cmpge_epu64_mask + #define _mm256_cmpge_epu64_mask(a, b) simde_mm256_cmpge_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1365,7 +1365,7 @@ simde_mm256_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu64_mask - #define _mm256_mask_cmpge_epu64_mask(src, k, a, b) simde_mm256_mask_cmpge_epu64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu64_mask(k, a, b) simde_mm256_mask_cmpge_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1425,7 +1425,7 @@ simde_mm512_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu64_mask - #define _mm512_mask_cmpge_epu64_mask(src, k, a, b) simde_mm512_mask_cmpge_epu64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu64_mask(k, a, b) simde_mm512_mask_cmpge_epu64_mask((k), (a), (b)) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/cmpgt.h b/x86/avx512/cmpgt.h index 2894df9bb..15245f968 100644 --- a/x86/avx512/cmpgt.h +++ b/x86/avx512/cmpgt.h @@ -109,6 +109,29 @@ simde_mm512_cmpgt_epu8_mask (simde__m512i a, simde__m512i b) 
{ #define _mm512_cmpgt_epu8_mask(a, b) simde_mm512_cmpgt_epu8_mask(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpgt_epi16_mask (simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpgt_epi16_mask(a, b); + #else + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { + r_.m256i[i] = simde_mm256_cmpgt_epi16(a_.m256i[i], b_.m256i[i]); + } + + return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpgt_epi16_mask + #define _mm512_cmpgt_epi16_mask(a, b) simde_mm512_cmpgt_epi16_mask(a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm512_cmpgt_epi32_mask (simde__m512i a, simde__m512i b) { diff --git a/x86/avx512/cmple.h b/x86/avx512/cmple.h index c83227f48..9b3c3aad2 100644 --- a/x86/avx512/cmple.h +++ b/x86/avx512/cmple.h @@ -76,8 +76,8 @@ simde_mm_cmple_epi8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi8_mask - #define _mm512_cmple_epi8_mask(a, b) simde_mm512_cmple_epi8_mask((a), (b)) + #undef _mm_cmple_epi8_mask + #define _mm_cmple_epi8_mask(a, b) simde_mm_cmple_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -91,7 +91,7 @@ simde_mm_mask_cmple_epi8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi8_mask - #define _mm_mask_cmple_epi8_mask(src, k, a, b) simde_mm_mask_cmple_epi8_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi8_mask(k, a, b) simde_mm_mask_cmple_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -132,8 +132,8 @@ simde_mm256_cmple_epi8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi8_mask - #define _mm512_cmple_epi8_mask(a, b) simde_mm512_cmple_epi8_mask((a), (b)) + #undef _mm256_cmple_epi8_mask + #define _mm256_cmple_epi8_mask(a, b) simde_mm256_cmple_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -147,7 +147,7 @@ simde_mm256_mask_cmple_epi8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi8_mask - #define _mm256_mask_cmple_epi8_mask(src, k, a, b) simde_mm256_mask_cmple_epi8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi8_mask(k, a, b) simde_mm256_mask_cmple_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -207,7 +207,7 @@ simde_mm512_mask_cmple_epi8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi8_mask - #define _mm512_mask_cmple_epi8_mask(src, k, a, b) simde_mm512_mask_cmple_epi8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi8_mask(k, a, b) simde_mm512_mask_cmple_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -250,8 +250,8 @@ simde_mm_cmple_epu8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu8_mask - #define _mm512_cmple_epu8_mask(a, b) simde_mm512_cmple_epu8_mask((a), (b)) + #undef 
_mm_cmple_epu8_mask + #define _mm_cmple_epu8_mask(a, b) simde_mm_cmple_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -265,7 +265,7 @@ simde_mm_mask_cmple_epu8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu8_mask - #define _mm_mask_cmple_epu8_mask(src, k, a, b) simde_mm_mask_cmple_epu8_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu8_mask(k, a, b) simde_mm_mask_cmple_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -306,8 +306,8 @@ simde_mm256_cmple_epu8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu8_mask - #define _mm512_cmple_epu8_mask(a, b) simde_mm512_cmple_epu8_mask((a), (b)) + #undef _mm256_cmple_epu8_mask + #define _mm256_cmple_epu8_mask(a, b) simde_mm256_cmple_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -321,7 +321,7 @@ simde_mm256_mask_cmple_epu8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu8_mask - #define _mm256_mask_cmple_epu8_mask(src, k, a, b) simde_mm256_mask_cmple_epu8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu8_mask(k, a, b) simde_mm256_mask_cmple_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -381,7 +381,7 @@ simde_mm512_mask_cmple_epu8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu8_mask - #define _mm512_mask_cmple_epu8_mask(src, k, a, b) simde_mm512_mask_cmple_epu8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu8_mask(k, a, b) simde_mm512_mask_cmple_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -424,8 +424,8 @@ simde_mm_cmple_epi16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi16_mask - #define _mm512_cmple_epi16_mask(a, b) simde_mm512_cmple_epi16_mask((a), (b)) + #undef _mm_cmple_epi16_mask + #define _mm_cmple_epi16_mask(a, b) simde_mm_cmple_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -439,7 +439,7 @@ simde_mm_mask_cmple_epi16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi16_mask - #define _mm_mask_cmple_epi16_mask(src, k, a, b) simde_mm_mask_cmple_epi16_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi16_mask(k, a, b) simde_mm_mask_cmple_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -480,8 +480,8 @@ simde_mm256_cmple_epi16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi16_mask - #define _mm512_cmple_epi16_mask(a, b) simde_mm512_cmple_epi16_mask((a), (b)) + #undef _mm256_cmple_epi16_mask + #define _mm256_cmple_epi16_mask(a, b) simde_mm256_cmple_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -495,7 +495,7 @@ simde_mm256_mask_cmple_epi16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef 
_mm256_mask_cmple_epi16_mask - #define _mm256_mask_cmple_epi16_mask(src, k, a, b) simde_mm256_mask_cmple_epi16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi16_mask(k, a, b) simde_mm256_mask_cmple_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -555,7 +555,7 @@ simde_mm512_mask_cmple_epi16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi16_mask - #define _mm512_mask_cmple_epi16_mask(src, k, a, b) simde_mm512_mask_cmple_epi16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi16_mask(k, a, b) simde_mm512_mask_cmple_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -598,8 +598,8 @@ simde_mm_cmple_epu16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu16_mask - #define _mm512_cmple_epu16_mask(a, b) simde_mm512_cmple_epu16_mask((a), (b)) + #undef _mm_cmple_epu16_mask + #define _mm_cmple_epu16_mask(a, b) simde_mm_cmple_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -613,7 +613,7 @@ simde_mm_mask_cmple_epu16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu16_mask - #define _mm_mask_cmple_epu16_mask(src, k, a, b) simde_mm_mask_cmple_epu16_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu16_mask(k, a, b) simde_mm_mask_cmple_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -654,8 +654,8 @@ simde_mm256_cmple_epu16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu16_mask - #define _mm512_cmple_epu16_mask(a, b) simde_mm512_cmple_epu16_mask((a), (b)) + #undef _mm256_cmple_epu16_mask + #define _mm256_cmple_epu16_mask(a, b) simde_mm256_cmple_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -669,7 +669,7 @@ simde_mm256_mask_cmple_epu16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu16_mask - #define _mm256_mask_cmple_epu16_mask(src, k, a, b) simde_mm256_mask_cmple_epu16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu16_mask(k, a, b) simde_mm256_mask_cmple_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -729,7 +729,7 @@ simde_mm512_mask_cmple_epu16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu16_mask - #define _mm512_mask_cmple_epu16_mask(src, k, a, b) simde_mm512_mask_cmple_epu16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu16_mask(k, a, b) simde_mm512_mask_cmple_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -772,8 +772,8 @@ simde_mm_cmple_epi32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi32_mask - #define _mm512_cmple_epi32_mask(a, b) simde_mm512_cmple_epi32_mask((a), (b)) + #undef _mm_cmple_epi32_mask + #define _mm_cmple_epi32_mask(a, b) simde_mm_cmple_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -787,7 +787,7 @@ simde_mm_mask_cmple_epi32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef 
_mm_mask_cmple_epi32_mask - #define _mm_mask_cmple_epi32_mask(src, k, a, b) simde_mm_mask_cmple_epi32_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi32_mask(k, a, b) simde_mm_mask_cmple_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -828,8 +828,8 @@ simde_mm256_cmple_epi32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi32_mask - #define _mm512_cmple_epi32_mask(a, b) simde_mm512_cmple_epi32_mask((a), (b)) + #undef _mm256_cmple_epi32_mask + #define _mm256_cmple_epi32_mask(a, b) simde_mm256_cmple_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -843,7 +843,7 @@ simde_mm256_mask_cmple_epi32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi32_mask - #define _mm256_mask_cmple_epi32_mask(src, k, a, b) simde_mm256_mask_cmple_epi32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi32_mask(k, a, b) simde_mm256_mask_cmple_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -903,7 +903,7 @@ simde_mm512_mask_cmple_epi32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi32_mask - #define _mm512_mask_cmple_epi32_mask(src, k, a, b) simde_mm512_mask_cmple_epi32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi32_mask(k, a, b) simde_mm512_mask_cmple_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -946,8 +946,8 @@ simde_mm_cmple_epu32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu32_mask - #define _mm512_cmple_epu32_mask(a, b) simde_mm512_cmple_epu32_mask((a), (b)) + #undef _mm_cmple_epu32_mask + #define _mm_cmple_epu32_mask(a, b) simde_mm_cmple_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -961,7 +961,7 @@ simde_mm_mask_cmple_epu32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu32_mask - #define _mm_mask_cmple_epu32_mask(src, k, a, b) simde_mm_mask_cmple_epu32_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu32_mask(k, a, b) simde_mm_mask_cmple_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1002,8 +1002,8 @@ simde_mm256_cmple_epu32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu32_mask - #define _mm512_cmple_epu32_mask(a, b) simde_mm512_cmple_epu32_mask((a), (b)) + #undef _mm256_cmple_epu32_mask + #define _mm256_cmple_epu32_mask(a, b) simde_mm256_cmple_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1017,7 +1017,7 @@ simde_mm256_mask_cmple_epu32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu32_mask - #define _mm256_mask_cmple_epu32_mask(src, k, a, b) simde_mm256_mask_cmple_epu32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu32_mask(k, a, b) simde_mm256_mask_cmple_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1077,7 +1077,7 @@ simde_mm512_mask_cmple_epu32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu32_mask - #define _mm512_mask_cmple_epu32_mask(src, k, a, b) simde_mm512_mask_cmple_epu32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu32_mask(k, a, b) 
simde_mm512_mask_cmple_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1135,7 +1135,7 @@ simde_mm_mask_cmple_epi64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi64_mask - #define _mm_mask_cmple_epi64_mask(src, k, a, b) simde_mm_mask_cmple_epi64_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi64_mask(k, a, b) simde_mm_mask_cmple_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1191,7 +1191,7 @@ simde_mm256_mask_cmple_epi64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi64_mask - #define _mm256_mask_cmple_epi64_mask(src, k, a, b) simde_mm256_mask_cmple_epi64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi64_mask(k, a, b) simde_mm256_mask_cmple_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1251,7 +1251,7 @@ simde_mm512_mask_cmple_epi64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi64_mask - #define _mm512_mask_cmple_epi64_mask(src, k, a, b) simde_mm512_mask_cmple_epi64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi64_mask(k, a, b) simde_mm512_mask_cmple_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1292,8 +1292,8 @@ simde_mm_cmple_epu64_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu64_mask - #define _mm512_cmple_epu64_mask(a, b) simde_mm512_cmple_epu64_mask((a), (b)) + #undef _mm_cmple_epu64_mask + #define _mm_cmple_epu64_mask(a, b) simde_mm_cmple_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1307,7 +1307,7 @@ simde_mm_mask_cmple_epu64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu64_mask - #define _mm_mask_cmple_epu64_mask(src, k, a, b) simde_mm_mask_cmple_epu64_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu64_mask(k, a, b) simde_mm_mask_cmple_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1348,8 +1348,8 @@ simde_mm256_cmple_epu64_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu64_mask - #define _mm512_cmple_epu64_mask(a, b) simde_mm512_cmple_epu64_mask((a), (b)) + #undef _mm256_cmple_epu64_mask + #define _mm256_cmple_epu64_mask(a, b) simde_mm256_cmple_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1363,7 +1363,7 @@ simde_mm256_mask_cmple_epu64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu64_mask - #define _mm256_mask_cmple_epu64_mask(src, k, a, b) simde_mm256_mask_cmple_epu64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu64_mask(k, a, b) simde_mm256_mask_cmple_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1423,7 +1423,7 @@ simde_mm512_mask_cmple_epu64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu64_mask - #define _mm512_mask_cmple_epu64_mask(src, k, a, b) simde_mm512_mask_cmple_epu64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu64_mask(k, a, b) simde_mm512_mask_cmple_epu64_mask((k), (a), (b)) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/cmpneq.h b/x86/avx512/cmpneq.h index 6583155dd..6e9bf3364 100644 --- a/x86/avx512/cmpneq.h 
+++ b/x86/avx512/cmpneq.h @@ -61,7 +61,7 @@ simde_mm_mask_cmpneq_epi8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi8_mask - #define _mm_mask_cmpneq_epi8_mask(a, b) simde_mm_mask_cmpneq_epi8_mask((a), (b)) + #define _mm_mask_cmpneq_epi8_mask(k1, a, b) simde_mm_mask_cmpneq_epi8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,7 +89,7 @@ simde_mm_mask_cmpneq_epu8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu8_mask - #define _mm_mask_cmpneq_epu8_mask(a, b) simde_mm_mask_cmpneq_epu8_mask((a), (b)) + #define _mm_mask_cmpneq_epu8_mask(k1, a, b) simde_mm_mask_cmpneq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -117,7 +117,7 @@ simde_mm_mask_cmpneq_epi16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi16_mask - #define _mm_mask_cmpneq_epi16_mask(a, b) simde_mm_mask_cmpneq_epi16_mask((a), (b)) + #define _mm_mask_cmpneq_epi16_mask(k1, a, b) simde_mm_mask_cmpneq_epi16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -145,7 +145,7 @@ simde_mm_mask_cmpneq_epu16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu16_mask - #define _mm_mask_cmpneq_epu16_mask(a, b) simde_mm_mask_cmpneq_epu16_mask((a), (b)) + #define _mm_mask_cmpneq_epu16_mask(k1, a, b) simde_mm_mask_cmpneq_epu16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -173,7 +173,7 @@ simde_mm_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi32_mask - #define _mm_mask_cmpneq_epi32_mask(a, b) simde_mm_mask_cmpneq_epi32_mask((a), (b)) + #define _mm_mask_cmpneq_epi32_mask(k1, a, b) simde_mm_mask_cmpneq_epi32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -201,7 +201,7 @@ simde_mm_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu32_mask - #define _mm_mask_cmpneq_epu32_mask(a, b) simde_mm_mask_cmpneq_epu32_mask((a), (b)) + #define _mm_mask_cmpneq_epu32_mask(k1, a, b) simde_mm_mask_cmpneq_epu32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -229,7 +229,7 @@ simde_mm_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi64_mask - #define _mm_mask_cmpneq_epi64_mask(a, b) simde_mm_mask_cmpneq_epi64_mask((a), (b)) + #define _mm_mask_cmpneq_epi64_mask(k1, a, b) simde_mm_mask_cmpneq_epi64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -257,7 +257,7 @@ simde_mm_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu64_mask - #define _mm_mask_cmpneq_epu64_mask(a, b) simde_mm_mask_cmpneq_epu64_mask((a), (b)) + #define _mm_mask_cmpneq_epu64_mask(k1, a, b) simde_mm_mask_cmpneq_epu64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -285,7 +285,7 @@ simde_mm256_mask_cmpneq_epi8_mask(simde__mmask32 k1, simde__m256i a, 
simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi8_mask - #define _mm256_mask_cmpneq_epi8_mask(a, b) simde_mm256_mask_cmpneq_epi8_mask((a), (b)) + #define _mm256_mask_cmpneq_epi8_mask(k1, a, b) simde_mm256_mask_cmpneq_epi8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -313,7 +313,7 @@ simde_mm256_mask_cmpneq_epu8_mask(simde__mmask32 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu8_mask - #define _mm256_mask_cmpneq_epu8_mask(a, b) simde_mm256_mask_cmpneq_epu8_mask((a), (b)) + #define _mm256_mask_cmpneq_epu8_mask(k1, a, b) simde_mm256_mask_cmpneq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -341,7 +341,7 @@ simde_mm256_mask_cmpneq_epi16_mask(simde__mmask16 k1, simde__m256i a, simde__m25 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi16_mask - #define _mm256_mask_cmpneq_epi16_mask(a, b) simde_mm256_mask_cmpneq_epi16_mask((a), (b)) + #define _mm256_mask_cmpneq_epi16_mask(k1, a, b) simde_mm256_mask_cmpneq_epi16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -369,7 +369,7 @@ simde_mm256_mask_cmpneq_epu16_mask(simde__mmask16 k1, simde__m256i a, simde__m25 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu16_mask - #define _mm256_mask_cmpneq_epu16_mask(a, b) simde_mm256_mask_cmpneq_epu16_mask((a), (b)) + #define _mm256_mask_cmpneq_epu16_mask(k1, a, b) simde_mm256_mask_cmpneq_epu16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -397,7 +397,7 @@ simde_mm256_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi32_mask - #define _mm256_mask_cmpneq_epi32_mask(a, b) simde_mm256_mask_cmpneq_epi32_mask((a), (b)) + #define _mm256_mask_cmpneq_epi32_mask(k1, a, b) simde_mm256_mask_cmpneq_epi32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -425,7 +425,7 @@ simde_mm256_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu32_mask - #define _mm256_mask_cmpneq_epu32_mask(a, b) simde_mm256_mask_cmpneq_epu32_mask((a), (b)) + #define _mm256_mask_cmpneq_epu32_mask(k1, a, b) simde_mm256_mask_cmpneq_epu32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -453,7 +453,7 @@ simde_mm256_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi64_mask - #define _mm256_mask_cmpneq_epi64_mask(a, b) simde_mm256_mask_cmpneq_epi64_mask((a), (b)) + #define _mm256_mask_cmpneq_epi64_mask(k1, a, b) simde_mm256_mask_cmpneq_epi64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -481,7 +481,7 @@ simde_mm256_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu64_mask - #define _mm256_mask_cmpneq_epu64_mask(a, b) simde_mm256_mask_cmpneq_epu64_mask((a), (b)) + #define _mm256_mask_cmpneq_epu64_mask(k1, a, b) simde_mm256_mask_cmpneq_epu64_mask((k1), (a), (b)) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/compress.h b/x86/avx512/compress.h index 1eb6fae45..06fffc733 100644 --- 
a/x86/avx512/compress.h +++ b/x86/avx512/compress.h @@ -34,14 +34,17 @@ simde_mm256_mask_compress_pd (simde__m256d src, simde__mmask8 k, simde__m256d a) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_pd - #define _mm256_mask_compress_pd(src, k, a) _mm256_mask_compress_pd(src, k, a) + #define _mm256_mask_compress_pd(src, k, a) simde_mm256_mask_compress_pd(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_pd(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_pd(base_addr, store_mask, _mm256_maskz_compress_pd(k, a)); #else simde__m256d_private a_ = simde__m256d_to_private(a); @@ -61,7 +64,7 @@ simde_mm256_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m25 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compressstoreu_pd - #define _mm256_mask_compressstoreu_pd(base_addr, k, a) _mm256_mask_compressstoreu_pd(base_addr, k, a) + #define _mm256_mask_compressstoreu_pd(base_addr, k, a) simde_mm256_mask_compressstoreu_pd(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -90,7 +93,7 @@ simde_mm256_maskz_compress_pd (simde__mmask8 k, simde__m256d a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_pd - #define _mm256_maskz_compress_pd(k, a) _mm256_maskz_compress_pd(k, a) + #define _mm256_maskz_compress_pd(k, a) simde_mm256_maskz_compress_pd(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -120,14 +123,17 @@ simde_mm256_mask_compress_ps (simde__m256 src, simde__mmask8 k, simde__m256 a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_ps - #define _mm256_mask_compress_ps(src, k, a) _mm256_mask_compress_ps(src, k, a) + #define _mm256_mask_compress_ps(src, k, a) simde_mm256_mask_compress_ps(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_ps(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_ps(base_addr, store_mask, _mm256_maskz_compress_ps(k, a)); #else simde__m256_private a_ = simde__m256_to_private(a); @@ -146,8 +152,8 @@ simde_mm256_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m25 #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compressstoreu_pd - #define _mm256_mask_compressstoreu_ps(base_addr, k, a) _mm256_mask_compressstoreu_ps(base_addr, k, a) + #undef _mm256_mask_compressstoreu_ps + #define _mm256_mask_compressstoreu_ps(base_addr, k, a) simde_mm256_mask_compressstoreu_ps(base_addr, k, a) 
#endif SIMDE_FUNCTION_ATTRIBUTES @@ -176,7 +182,7 @@ simde_mm256_maskz_compress_ps (simde__mmask8 k, simde__m256 a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_ps - #define _mm256_maskz_compress_ps(k, a) _mm256_maskz_compress_ps(k, a) + #define _mm256_maskz_compress_ps(k, a) simde_mm256_maskz_compress_ps(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -206,14 +212,17 @@ simde_mm256_mask_compress_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_epi32 - #define _mm256_mask_compress_epi32(src, k, a) _mm256_mask_compress_epi32(src, k, a) + #define _mm256_mask_compress_epi32(src, k, a) simde_mm256_mask_compress_epi32(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_epi32(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_epi32(base_addr, store_mask, _mm256_maskz_compress_epi32(k, a)); #else simde__m256i_private a_ = simde__m256i_to_private(a); @@ -233,7 +242,7 @@ simde_mm256_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compressstoreu_epi32 - #define _mm256_mask_compressstoreu_epi32(base_addr, k, a) _mm256_mask_compressstoreu_epi32(base_addr, k, a) + #define _mm256_mask_compressstoreu_epi32(base_addr, k, a) simde_mm256_mask_compressstoreu_epi32(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -262,7 +271,7 @@ simde_mm256_maskz_compress_epi32 (simde__mmask8 k, simde__m256i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_epi32 - #define _mm256_maskz_compress_epi32(k, a) _mm256_maskz_compress_epi32(k, a) + #define _mm256_maskz_compress_epi32(k, a) simde_mm256_maskz_compress_epi32(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -292,14 +301,17 @@ simde_mm256_mask_compress_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_epi64 - #define _mm256_mask_compress_epi64(src, k, a) _mm256_mask_compress_epi64(src, k, a) + #define _mm256_mask_compress_epi64(src, k, a) simde_mm256_mask_compress_epi64(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_epi64(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_epi64(base_addr, store_mask, _mm256_maskz_compress_epi64(k, a)); #else simde__m256i_private a_ = simde__m256i_to_private(a); @@ -319,7 +331,7 @@ 
simde_mm256_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compressstoreu_epi64 - #define _mm256_mask_compressstoreu_epi64(base_addr, k, a) _mm256_mask_compressstoreu_epi64(base_addr, k, a) + #define _mm256_mask_compressstoreu_epi64(base_addr, k, a) simde_mm256_mask_compressstoreu_epi64(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -348,7 +360,7 @@ simde_mm256_maskz_compress_epi64 (simde__mmask8 k, simde__m256i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_epi64 - #define _mm256_maskz_compress_epi64(k, a) _mm256_maskz_compress_epi64(k, a) + #define _mm256_maskz_compress_epi64(k, a) simde_mm256_maskz_compress_epi64(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -378,14 +390,17 @@ simde_mm512_mask_compress_pd (simde__m512d src, simde__mmask8 k, simde__m512d a) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_pd - #define _mm512_mask_compress_pd(src, k, a) _mm512_mask_compress_pd(src, k, a) + #define _mm512_mask_compress_pd(src, k, a) simde_mm512_mask_compress_pd(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_pd(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_pd(base_addr, store_mask, _mm512_maskz_compress_pd(k, a)); #else simde__m512d_private a_ = simde__m512d_to_private(a); @@ -405,7 +420,7 @@ simde_mm512_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m51 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compressstoreu_pd - #define _mm512_mask_compressstoreu_pd(base_addr, k, a) _mm512_mask_compressstoreu_pd(base_addr, k, a) + #define _mm512_mask_compressstoreu_pd(base_addr, k, a) simde_mm512_mask_compressstoreu_pd(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -434,7 +449,7 @@ simde_mm512_maskz_compress_pd (simde__mmask8 k, simde__m512d a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_pd - #define _mm512_maskz_compress_pd(k, a) _mm512_maskz_compress_pd(k, a) + #define _mm512_maskz_compress_pd(k, a) simde_mm512_maskz_compress_pd(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -464,14 +479,17 @@ simde_mm512_mask_compress_ps (simde__m512 src, simde__mmask16 k, simde__m512 a) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_ps - #define _mm512_mask_compress_ps(src, k, a) _mm512_mask_compress_ps(src, k, a) + #define _mm512_mask_compress_ps(src, k, a) simde_mm512_mask_compress_ps(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_ps (void* base_addr, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && 
!defined(__znver4__) _mm512_mask_compressstoreu_ps(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask16 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_ps(base_addr, store_mask, _mm512_maskz_compress_ps(k, a)); #else simde__m512_private a_ = simde__m512_to_private(a); @@ -490,8 +508,8 @@ simde_mm512_mask_compressstoreu_ps (void* base_addr, simde__mmask16 k, simde__m5 #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_pd - #define _mm512_mask_compressstoreu_ps(base_addr, k, a) _mm512_mask_compressstoreu_ps(base_addr, k, a) + #undef _mm512_mask_compressstoreu_ps + #define _mm512_mask_compressstoreu_ps(base_addr, k, a) simde_mm512_mask_compressstoreu_ps(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -520,7 +538,7 @@ simde_mm512_maskz_compress_ps (simde__mmask16 k, simde__m512 a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_ps - #define _mm512_maskz_compress_ps(k, a) _mm512_maskz_compress_ps(k, a) + #define _mm512_maskz_compress_ps(k, a) simde_mm512_maskz_compress_ps(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -550,14 +568,47 @@ simde_mm512_mask_compress_epi32 (simde__m512i src, simde__mmask16 k, simde__m512 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_epi32 - #define _mm512_mask_compress_epi32(src, k, a) _mm512_mask_compress_epi32(src, k, a) + #define _mm512_mask_compress_epi32(src, k, a) simde_mm512_mask_compress_epi32(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_compressstoreu_epi16 (void* base_addr, simde__mmask32 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(__znver4__) + _mm512_mask_compressstoreu_epi16(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(__znver4__) + simde__mmask32 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_epi16(base_addr, store_mask, _mm512_maskz_compress_epi16(k, a)); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { + if ((k >> i) & 1) { + a_.i16[ri++] = a_.i16[i]; + } + } + + simde_memcpy(base_addr, &a_, ri * sizeof(a_.i16[0])); + + return; + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_compressstoreu_epi16 + #define _mm512_mask_compressstoreu_epi16(base_addr, k, a) simde_mm512_mask_compressstoreu_epi16(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_epi32 (void* base_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi32(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask16 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_epi32(base_addr, store_mask, _mm512_maskz_compress_epi32(k, a)); #else simde__m512i_private a_ = simde__m512i_to_private(a); @@ -577,7 +628,7 @@ simde_mm512_mask_compressstoreu_epi32 (void* base_addr, simde__mmask16 k, simde_ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compressstoreu_epi32 - #define _mm512_mask_compressstoreu_epi32(base_addr, k, a) _mm512_mask_compressstoreu_epi32(base_addr, k, a) + #define _mm512_mask_compressstoreu_epi32(base_addr, k, a) simde_mm512_mask_compressstoreu_epi32(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -606,7 +657,7 @@ simde_mm512_maskz_compress_epi32 (simde__mmask16 k, simde__m512i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_epi32 - #define _mm512_maskz_compress_epi32(k, a) _mm512_maskz_compress_epi32(k, a) + #define _mm512_maskz_compress_epi32(k, a) simde_mm512_maskz_compress_epi32(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -636,14 +687,17 @@ simde_mm512_mask_compress_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_epi64 - #define _mm512_mask_compress_epi64(src, k, a) _mm512_mask_compress_epi64(src, k, a) + #define _mm512_mask_compress_epi64(src, k, a) simde_mm512_mask_compress_epi64(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi64(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_epi64(base_addr, store_mask, _mm512_maskz_compress_epi64(k, a)); #else simde__m512i_private a_ = simde__m512i_to_private(a); @@ -663,7 +717,7 @@ simde_mm512_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compressstoreu_epi64 - #define _mm512_mask_compressstoreu_epi64(base_addr, k, a) _mm512_mask_compressstoreu_epi64(base_addr, k, a) + #define _mm512_mask_compressstoreu_epi64(base_addr, k, a) simde_mm512_mask_compressstoreu_epi64(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -692,7 +746,7 @@ simde_mm512_maskz_compress_epi64 (simde__mmask8 k, simde__m512i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_epi64 - #define _mm512_maskz_compress_epi64(k, a) _mm512_maskz_compress_epi64(k, a) + #define _mm512_maskz_compress_epi64(k, a) simde_mm512_maskz_compress_epi64(k, a) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/cvt.h b/x86/avx512/cvt.h index 6abf8e897..579bcac10 100644 --- a/x86/avx512/cvt.h +++ b/x86/avx512/cvt.h @@ -32,6 +32,7 @@ #include "types.h" #include "mov.h" +#include "../../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -106,9 +107,36 @@ simde_mm_maskz_cvtepi64_pd(simde__mmask8 k, simde__m128i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_cvtepi64_pd - #define _mm_maskz_cvtepi64_pd(k, a) simde_mm_maskz_cvtepi64_pd(k, a) + #define _mm_maskz_cvtepi64_pd(k, a) simde_mm_maskz_cvtepi64_pd((k), (a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvtepi16_epi32 (simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + 
return _mm512_cvtepi16_epi32(a); + #else + simde__m512i_private r_; + simde__m256i_private a_ = simde__m256i_to_private(a); + + #if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = a_.i16[i]; + } + #endif + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtepi16_epi32 + #define _mm512_cvtepi16_epi32(a) simde_mm512_cvtepi16_epi32(a) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_cvtepi16_epi8 (simde__m512i a) { @@ -172,7 +200,10 @@ simde_mm512_cvtepi8_epi16 (simde__m256i a) { simde__m512i_private r_; simde__m256i_private a_ = simde__m256i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_X86_AVX2_NATIVE) + r_.m256i[0] = _mm256_cvtepi8_epi16(a_.m128i[0]); + r_.m256i[1] = _mm256_cvtepi8_epi16(a_.m128i[1]); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8); #else SIMDE_VECTORIZE @@ -189,6 +220,35 @@ simde_mm512_cvtepi8_epi16 (simde__m256i a) { #define _mm512_cvtepi8_epi16(a) simde_mm512_cvtepi8_epi16(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_cvtepi32_ps (simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtepi32_ps(a); + #else + simde__m512_private r_; + simde__m512i_private a_ = simde__m512i_to_private(a); + + #if defined(SIMDE_X86_AVX_NATIVE) + r_.m256[0] = _mm256_cvtepi32_ps(a_.m256i[0]); + r_.m256[1] = _mm256_cvtepi32_ps(a_.m256i[1]); + #elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.i32[i]); + } + #endif + + return simde__m512_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtepi32_ps + #define _mm512_cvtepi32_ps(a) simde_mm512_cvtepi32_ps(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_cvtepi64_epi32 (simde__m512i a) { @@ -212,7 +272,33 @@ simde_mm512_cvtepi64_epi32 (simde__m512i a) { } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_cvtepi64_epi32 - #define _mm512_cvtepi64_epi32(a) simde_mm512_cvtepi64_epi32(a) + #define _mm512_cvtepi64_epi32(a) simde_mm512_cvtepi64_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvtepu16_epi32 (simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtepu16_epi32(a); + #else + simde__m512i_private r_; + simde__m256i_private a_ = simde__m256i_to_private(a); + + #if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.u16[i]); + } + #endif + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtepu16_epi32 + #define _mm512_cvtepu16_epi32(a) simde_mm512_cvtepu16_epi32(a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -247,8 +333,67 @@ simde_mm512_cvtepu32_ps (simde__m512i a) { #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepu32_epi32 - #define _mm512_cvtepu32_epi32(a) simde_mm512_cvtepu32_ps(a) + #undef _mm512_cvtepu32_ps + #define _mm512_cvtepu32_ps(a) simde_mm512_cvtepu32_ps(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_cvtph_ps(simde__m256i a) { + #if 
defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtph_ps(a); + #endif + simde__m256i_private a_ = simde__m256i_to_private(a); + simde__m512_private r_; + + #if defined(SIMDE_X86_F16C_NATIVE) + r_.m256[0] = _mm256_cvtph_ps(a_.m128i[0]); + r_.m256[1] = _mm256_cvtph_ps(a_.m128i[1]); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(a_.f16[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); + } + #endif + + return simde__m512_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtph_ps + #define _mm512_cvtph_ps(a) simde_mm512_cvtph_ps(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvtps_epi32(simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtps_epi32(a); + #endif + simde__m512_private a_ = simde__m512_to_private(a); + simde__m512i_private r_; + + #if defined(SIMDE_X86_AVX_NATIVE) + r_.m256i[0] = _mm256_cvtps_epi32(a_.m256[0]); + r_.m256i[1] = _mm256_cvtps_epi32(a_.m256[1]); + #elif defined(simde_math_nearbyintf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i])); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtps_epi32 + #define _mm512_cvtps_epi32(a) simde_mm512_cvtps_epi32(a) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/cvts.h b/x86/avx512/cvts.h index c35c2f9e4..0194889a7 100644 --- a/x86/avx512/cvts.h +++ b/x86/avx512/cvts.h @@ -31,6 +31,8 @@ #include "types.h" #include "mov.h" +#include "storeu.h" +#include "loadu.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -362,6 +364,34 @@ simde_mm512_mask_cvtsepi32_epi8 (simde__m128i src, simde__mmask16 k, simde__m512 #define _mm512_mask_cvtsepi32_epi8(src, k, a) simde_mm512_mask_cvtsepi32_epi8(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a); + #else + simde__m256i_private r_ = simde__m256i_to_private(simde_mm256_loadu_epi8(base_addr)); + simde__m512i_private a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r_.i8[i] = ((k>>i) &1 ) ? + ((a_.i32[i] < INT8_MIN) + ? (INT8_MIN) + : ((a_.i32[i] > INT8_MAX) + ? 
(INT8_MAX) + : HEDLEY_STATIC_CAST(int8_t, a_.i32[i]))) : r_.i8[i]; + } + + simde_mm256_storeu_epi8(base_addr, simde__m256i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cvtsepi32_storeu_epi8 + #define _mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a) simde_mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm512_maskz_cvtsepi32_epi8 (simde__mmask16 k, simde__m512i a) { @@ -444,6 +474,34 @@ simde_mm512_mask_cvtsepi32_epi16 (simde__m256i src, simde__mmask16 k, simde__m51 #define _mm512_mask_cvtsepi32_epi16(src, k, a) simde_mm512_mask_cvtsepi32_epi16(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a); + #else + simde__m256i_private r_; + simde__m256i_private src_ = simde__m256i_to_private(simde_mm256_loadu_epi16(base_addr)); + simde__m512i_private a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r_.i16[i] = ((k>>i) &1 ) ? + ((a_.i32[i] < INT16_MIN) + ? (INT16_MIN) + : ((a_.i32[i] > INT16_MAX) + ? (INT16_MAX) + : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]))) : src_.i16[i]; + } + + simde_mm256_storeu_epi16(base_addr, simde__m256i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cvtsepi32_storeu_epi16 + #define _mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a) simde_mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_maskz_cvtsepi32_epi16 (simde__mmask16 k, simde__m512i a) { diff --git a/x86/avx512/cvtt.h b/x86/avx512/cvtt.h index 044507ce4..937f7fb72 100644 --- a/x86/avx512/cvtt.h +++ b/x86/avx512/cvtt.h @@ -98,6 +98,32 @@ simde_mm_maskz_cvttpd_epi64(simde__mmask8 k, simde__m128d a) { #define _mm_maskz_cvttpd_epi64(k, a) simde_mm_maskz_cvttpd_epi64(k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvttps_epi32 (simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvttps_epi32(a); + #else + simde__m512i_private r_; + simde__m512_private a_ = simde__m512_to_private(a); + + #if defined(simde_math_truncf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i])); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvttps_epi32 + #define _mm512_cvttps_epi32(a) simde_mm512_cvttps_epi32(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/cvtus.h b/x86/avx512/cvtus.h new file mode 100644 index 000000000..ce423f6c9 --- /dev/null +++ b/x86/avx512/cvtus.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * 
included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_CVTUS_H) +#define SIMDE_X86_AVX512_CVTUS_H + +#include "types.h" +#include "mov.h" +#include "storeu.h" +#include "loadu.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_cvtusepi32_storeu_epi8(base_addr, k, a); + #else + simde__m256i_private r_ = simde__m256i_to_private(simde_mm256_loadu_epi8(base_addr)); + simde__m512i_private a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r_.i8[i] = ((k>>i) &1 ) ? + ((a_.u32[i] > UINT8_MAX) + ? (HEDLEY_STATIC_CAST(int8_t, UINT8_MAX)) + : HEDLEY_STATIC_CAST(int8_t, a_.u32[i])) : r_.i8[i]; + } + + simde_mm256_storeu_epi8(base_addr, simde__m256i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cvtusepi32_storeu_epi8 + #define _mm512_mask_cvtusepi32_storeu_epi8(base_addr, k, a) simde_mm512_mask_cvtusepi32_storeu_epi8((base_addr), (k), (a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_CVTUS_H) */ diff --git a/x86/avx512/dpbf16.h b/x86/avx512/dpbf16.h index 56f2c68f1..81e2aead2 100644 --- a/x86/avx512/dpbf16.h +++ b/x86/avx512/dpbf16.h @@ -20,7 +20,7 @@ simde_mm_dpbf16_ps (simde__m128 src, simde__m128bh a, simde__m128bh b) { a_ = simde__m128bh_to_private(a), b_ = simde__m128bh_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) + #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) uint32_t x1 SIMDE_VECTOR(32); uint32_t x2 SIMDE_VECTOR(32); simde__m128_private @@ -109,7 +109,7 @@ simde_mm256_dpbf16_ps (simde__m256 src, simde__m256bh a, simde__m256bh b) { a_ = simde__m256bh_to_private(a), b_ = simde__m256bh_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) + #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) uint32_t x1 SIMDE_VECTOR(64); uint32_t x2 SIMDE_VECTOR(64); simde__m256_private @@ -198,7 +198,7 @@ simde_mm512_dpbf16_ps (simde__m512 src, simde__m512bh a, simde__m512bh b) { a_ = simde__m512bh_to_private(a), b_ = simde__m512bh_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) + #if ! 
( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) uint32_t x1 SIMDE_VECTOR(128); uint32_t x2 SIMDE_VECTOR(128); simde__m512_private diff --git a/x86/avx512/extract.h b/x86/avx512/extract.h index 2261513ea..251715cf4 100644 --- a/x86/avx512/extract.h +++ b/x86/avx512/extract.h @@ -35,6 +35,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm256_extractf32x4_ps (simde__m256 a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m256_private a_ = simde__m256_to_private(a); + + return a_.m128[imm8 & 1]; +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_extractf32x4_ps(a, imm8) _mm256_extractf32x4_ps(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_extractf32x4_ps + #define _mm256_extractf32x4_ps(a, imm8) simde_mm256_extractf32x4_ps((a), (imm8)) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm512_extractf32x4_ps (simde__m512 a, int imm8) @@ -61,27 +78,43 @@ simde_mm512_extractf32x4_ps (simde__m512 a, int imm8) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_extractf32x4_ps - #define _mm512_extractf32x4_ps(a, imm8) simde_mm512_extractf32x4_ps(a, imm8) + #define _mm512_extractf32x4_ps(a, imm8) simde_mm512_extractf32x4_ps((a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) _mm512_mask_extractf32x4_ps(src, k, a, imm8) #else - #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm_mask_mov_ps(src, k, simde_mm512_extractf32x4_ps(a, imm8)) + #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm_mask_mov_ps((src), (k), simde_mm512_extractf32x4_ps((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_extractf32x4_ps - #define _mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) + #define _mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm512_mask_extractf32x4_ps((src), (k), (a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) _mm512_maskz_extractf32x4_ps(k, a, imm8) #else - #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm_maskz_mov_ps(k, simde_mm512_extractf32x4_ps(a, imm8)) + #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm_maskz_mov_ps((k), simde_mm512_extractf32x4_ps((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_extractf32x4_ps - #define _mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm512_maskz_extractf32x4_ps(k, a, imm8) + #define _mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm512_maskz_extractf32x4_ps((k), (a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm512_extractf32x8_ps (simde__m512 a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m512_private a_ = simde__m512_to_private(a); + + return a_.m256[imm8 & 1]; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) + #define simde_mm512_extractf32x8_ps(a, imm8) _mm512_extractf32x8_ps(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_extractf32x8_ps + #define 
_mm512_extractf32x8_ps(a, imm8) simde_mm512_extractf32x8_ps(a, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -156,6 +189,42 @@ simde_mm512_extracti32x4_epi32 (simde__m512i a, int imm8) #define _mm512_maskz_extracti32x4_epi32(k, a, imm8) simde_mm512_maskz_extracti32x4_epi32(k, a, imm8) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm512_extracti32x8_epi32 (simde__m512i a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m512i_private a_ = simde__m512i_to_private(a); + + return a_.m256i[imm8 & 1]; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) + #define simde_mm512_extracti32x8_epi32(a, imm8) _mm512_extracti32x8_epi32(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_extracti32x8_epi32 + #define _mm512_extracti32x8_epi32(a, imm8) simde_mm512_extracti32x8_epi32((a), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX51FDQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) + #define simde_mm512_mask_extracti32x8_epi32(src, k, a, imm8) _mm512_mask_extracti32x8_epi32(src, k, a, imm8) +#else + #define simde_mm512_mask_extracti32x8_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32((src), (k), simde_mm512_extracti32x8_epi32((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_extracti32x8_epi32 + #define _mm512_mask_extracti32x8_epi32(src, k, a, imm8) simde_mm512_mask_extracti32x8_epi32((src), (k), (a), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) + #define simde_mm512_maskz_extracti32x8_epi32(k, a, imm8) _mm512_maskz_extracti32x8_epi32(k, a, imm8) +#else + #define simde_mm512_maskz_extracti32x8_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32((k), simde_mm512_extracti32x8_epi32((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_extracti32x8_epi32 + #define _mm512_maskz_extracti32x8_epi32(k, a, imm8) simde_mm512_maskz_extracti32x8_epi32((k), (a), (imm8)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_extracti64x4_epi64 (simde__m512i a, int imm8) @@ -169,27 +238,27 @@ simde_mm512_extracti64x4_epi64 (simde__m512i a, int imm8) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_extracti64x4_epi64 - #define _mm512_extracti64x4_epi64(a, imm8) simde_mm512_extracti64x4_epi64(a, imm8) + #define _mm512_extracti64x4_epi64(a, imm8) simde_mm512_extracti64x4_epi64((a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) _mm512_mask_extracti64x4_epi64(src, k, a, imm8) #else - #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm512_extracti64x4_epi64(a, imm8)) + #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64((src), (k), simde_mm512_extracti64x4_epi64((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_extracti64x4_epi64 - #define _mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) + #define _mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm512_mask_extracti64x4_epi64((src), (k), (a), (imm8)) #endif 
#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) _mm512_maskz_extracti64x4_epi64(k, a, imm8) #else - #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm512_extracti64x4_epi64(a, imm8)) + #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64((k), simde_mm512_extracti64x4_epi64((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_extracti64x4_epi64 - #define _mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) + #define _mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm512_maskz_extracti64x4_epi64((k), (a), (imm8)) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/fmaddsub.h b/x86/avx512/fmaddsub.h new file mode 100644 index 000000000..f1139e4d6 --- /dev/null +++ b/x86/avx512/fmaddsub.h @@ -0,0 +1,91 @@ +#if !defined(SIMDE_X86_AVX512_FMADDSUB_H) +#define SIMDE_X86_AVX512_FMADDSUB_H + +#include "types.h" +#include "../fma.h" +#include "mul.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_x_mm512_addsub_pd (simde__m512d a, simde__m512d b) { + //mm512_addsub_pd does not exist, but we define it for utility purposes (only with simde_x prefix, no native alias) + simde__m512d_private + r_, + a_ = simde__m512d_to_private(a), + b_ = simde__m512d_to_private(b); + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r_.m256d[0] = simde_mm256_addsub_pd(a_.m256d[0], b_.m256d[0]); + r_.m256d[1] = simde_mm256_addsub_pd(a_.m256d[1], b_.m256d[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { + r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ]; + r_.f64[i + 1] = a_.f64[i + 1] + b_.f64[i + 1]; + } + #endif + + return simde__m512d_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_x_mm512_addsub_ps (simde__m512 a, simde__m512 b) { + //mm512_addsub_ps does not exist, but we define it for utility purposes (only with simde_x prefix, no native alias) + simde__m512_private + r_, + a_ = simde__m512_to_private(a), + b_ = simde__m512_to_private(b); + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r_.m256[0] = simde_mm256_addsub_ps(a_.m256[0], b_.m256[0]); + r_.m256[1] = simde_mm256_addsub_ps(a_.m256[1], b_.m256[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { + r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ]; + r_.f32[i + 1] = a_.f32[i + 1] + b_.f32[i + 1]; + } + #endif + + return simde__m512_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_fmaddsub_pd (simde__m512d a, simde__m512d b, simde__m512d c) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_fmaddsub_pd(a, b, c); + #else + return simde_x_mm512_addsub_pd(simde_mm512_mul_pd(a, b), c); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_fmaddsub_pd + #define _mm512_fmaddsub_pd(a, b, c) simde_mm512_fmaddsub_pd(a, b, c) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_fmaddsub_ps (simde__m512 a, simde__m512 b, simde__m512 c) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_fmaddsub_ps(a, b, c); + #else + return simde_x_mm512_addsub_ps(simde_mm512_mul_ps(a, b), c); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_fmaddsub_ps + #define _mm512_fmaddsub_ps(a, b, 
c) simde_mm512_fmaddsub_ps(a, b, c) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_FMADDSUB_H) */ diff --git a/x86/avx512/fmsub.h b/x86/avx512/fmsub.h index 626294cb3..4f52d4074 100644 --- a/x86/avx512/fmsub.h +++ b/x86/avx512/fmsub.h @@ -47,7 +47,7 @@ simde_mm256_mask3_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c, simd } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask3_fmsub_pd - #define _mm256_mask3_fmsub_pd(a, b, c, k) _mm256_mask3_fmsub_pd(a, b, c, k) + #define _mm256_mask3_fmsub_pd(a, b, c, k) simde_mm256_mask3_fmsub_pd(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -61,7 +61,7 @@ simde_mm256_mask_fmsub_pd (simde__m256d a, simde__mmask8 k, simde__m256d b, simd } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_fmsub_pd - #define _mm256_mask_fmsub_pd(a, k, b, c) _mm256_mask_fmsub_pd(a, k, b, c) + #define _mm256_mask_fmsub_pd(a, k, b, c) simde_mm256_mask_fmsub_pd(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -75,7 +75,7 @@ simde_mm256_maskz_fmsub_pd (simde__mmask8 k, simde__m256d a, simde__m256d b, sim } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_fmsub_pd - #define _mm256_maskz_fmsub_pd(k, a, b, c) _mm256_maskz_fmsub_pd(k, a, b, c) + #define _mm256_maskz_fmsub_pd(k, a, b, c) simde_mm256_maskz_fmsub_pd(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,7 +89,7 @@ simde_mm_mask3_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask3_fmsub_pd - #define _mm_mask3_fmsub_pd(a, b, c, k) _mm_mask3_fmsub_pd(a, b, c, k) + #define _mm_mask3_fmsub_pd(a, b, c, k) simde_mm_mask3_fmsub_pd(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -103,7 +103,7 @@ simde_mm_mask_fmsub_pd (simde__m128d a, simde__mmask8 k, simde__m128d b, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_fmsub_pd - #define _mm_mask_fmsub_pd(a, k, b, c) _mm_mask_fmsub_pd(a, k, b, c) + #define _mm_mask_fmsub_pd(a, k, b, c) simde_mm_mask_fmsub_pd(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -117,7 +117,7 @@ simde_mm_maskz_fmsub_pd (simde__mmask8 k, simde__m128d a, simde__m128d b, simde_ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_fmsub_pd - #define _mm_maskz_fmsub_pd(k, a, b, c) _mm_maskz_fmsub_pd(k, a, b, c) + #define _mm_maskz_fmsub_pd(k, a, b, c) simde_mm_maskz_fmsub_pd(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -131,7 +131,7 @@ simde_mm256_mask3_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask3_fmsub_ps - #define _mm256_mask3_fmsub_ps(a, b, c, k) _mm256_mask3_fmsub_ps(a, b, c, k) + #define _mm256_mask3_fmsub_ps(a, b, c, k) simde_mm256_mask3_fmsub_ps(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -145,7 +145,7 @@ simde_mm256_mask_fmsub_ps (simde__m256 a, simde__mmask8 k, simde__m256 b, simde_ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_fmsub_ps - #define _mm256_mask_fmsub_ps(a, k, b, c) _mm256_mask_fmsub_ps(a, k, b, c) + #define _mm256_mask_fmsub_ps(a, k, b, c) simde_mm256_mask_fmsub_ps(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -159,7 +159,7 @@ simde_mm256_maskz_fmsub_ps (simde__mmask8 k, simde__m256 a, simde__m256 b, simde } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_fmsub_ps - #define _mm256_maskz_fmsub_ps(k, a, b, c) _mm256_maskz_fmsub_ps(k, a, b, c) + #define _mm256_maskz_fmsub_ps(k, a, 
b, c) simde_mm256_maskz_fmsub_ps(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -173,7 +173,7 @@ simde_mm_mask3_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c, simde__mma } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask3_fmsub_ps - #define _mm_mask3_fmsub_ps(a, b, c, k) _mm_mask3_fmsub_ps(a, b, c, k) + #define _mm_mask3_fmsub_ps(a, b, c, k) simde_mm_mask3_fmsub_ps(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -187,7 +187,7 @@ simde_mm_mask_fmsub_ps (simde__m128 a, simde__mmask8 k, simde__m128 b, simde__m1 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_fmsub_ps - #define _mm_mask_fmsub_ps(a, k, b, c) _mm_mask_fmsub_ps(a, k, b, c) + #define _mm_mask_fmsub_ps(a, k, b, c) simde_mm_mask_fmsub_ps(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -201,7 +201,7 @@ simde_mm_maskz_fmsub_ps (simde__mmask8 k, simde__m128 a, simde__m128 b, simde__m } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_fmsub_ps - #define _mm_maskz_fmsub_ps(k, a, b, c) _mm_maskz_fmsub_ps(k, a, b, c) + #define _mm_maskz_fmsub_ps(k, a, b, c) simde_mm_maskz_fmsub_ps(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/x86/avx512/fpclass.h b/x86/avx512/fpclass.h new file mode 100644 index 000000000..1765570d7 --- /dev/null +++ b/x86/avx512/fpclass.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_FPCLASS_H) +#define SIMDE_X86_AVX512_FPCLASS_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_fpclass_ps_mask(simde__m256 a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { + simde__mmask8 r = 0; + simde__m256_private a_ = simde__m256_to_private(a); + + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r |= simde_math_fpclassf(a_.f32[i], imm8) ? 
(UINT8_C(1) << i) : 0; + } + return r; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) +# define simde_mm256_fpclass_ps_mask(a, imm8) _mm256_fpclass_ps_mask((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +# undef _mm256_fpclass_ps_mask +# define _mm256_fpclass_ps_mask(a, imm8) simde_mm256_fpclass_ps_mask((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_fpclass_ph_mask(simde__m512h a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { + simde__mmask32 r = 0; + simde__m512h_private a_ = simde__m512h_to_private(a); + + for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { + r |= simde_fpclasshf(a_.f16[i], imm8) ? (UINT8_C(1) << i) : 0; + } + return r; +} +#if defined(SIMDE_X86_AVX512FP16_NATIVE) +# define simde_mm512_fpclass_ph_mask(a, imm8) _mm512_fpclass_ph_mask((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# undef _mm512_fpclass_ph_mask +# define _mm512_fpclass_ph_mask(a, imm8) simde_mm512_fpclass_ph_mask((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_fpclass_pd_mask(simde__m512d a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { + simde__mmask8 r = 0; + simde__m512d_private a_ = simde__m512d_to_private(a); + + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r |= simde_math_fpclass(a_.f64[i], imm8) ? (UINT8_C(1) << i) : 0; + } + return r; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) +# define simde_mm512_fpclass_pd_mask(a, imm8) _mm512_fpclass_pd_mask((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) +# undef _mm512_fpclass_pd_mask +# define _mm512_fpclass_pd_mask(a, imm8) simde_mm512_fpclass_pd_mask((a), (imm8)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_FPCLASS_H) */ diff --git a/x86/avx512/gather.h b/x86/avx512/gather.h new file mode 100644 index 000000000..8dec2ee0a --- /dev/null +++ b/x86/avx512/gather.h @@ -0,0 +1,312 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_GATHER_H) +#define SIMDE_X86_AVX512_GATHER_H + +#include "types.h" +#include "../avx2.h" +#include "extract.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_i32gather_ps(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_ = simde__m512i_to_private(vindex); + simde__m512_private r_ = simde__m512_to_private(simde_mm512_setzero_ps()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + simde_float32 dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.f32[i] = dst; + } + + return simde__m512_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0)) + #define simde_mm512_i32gather_ps(vindex, base_addr, scale) _mm512_i32gather_ps((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i32gather_ps(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m512_private simde_mm512_i32gather_ps_r_; \ + simde__m512i_private simde_mm512_i32gather_ps_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i32gather_ps_r_.m256[0] = _mm256_i32gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[0], (scale)); \ + simde_mm512_i32gather_ps_r_.m256[1] = _mm256_i32gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[1], (scale)); \ + simde__m512_from_private(simde_mm512_i32gather_ps_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i32gather_ps(vindex, base_addr, scale) \ + simde_x_mm512_set_m256( \ + _mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \ + _mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) ) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i32gather_ps + #define _mm512_i32gather_ps(vindex, base_addr, scale) simde_mm512_i32gather_ps((vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm512_i64gather_epi32(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_; + simde__m256i_private r_; + vindex_ = simde__m512i_to_private(vindex); + r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + int32_t dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.i32[i] = dst; + } + + return simde__m256i_from_private(r_); +} +#if 
defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) _mm512_i64gather_epi32((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m256i_private simde_mm512_i64gather_epi32_r_; \ + simde__m512i_private simde_mm512_i64gather_epi32_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_epi32_r_.m128i[0] = _mm256_i64gather_epi32( \ + HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_epi32_r_.m128i[1] = _mm256_i64gather_epi32( \ + HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[1], (scale)); \ + simde__m256i_from_private(simde_mm512_i64gather_epi32_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) \ + _mm256_insertf128_si256( \ + _mm256_castsi128_si256( \ + _mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \ + _mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ + 1) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_epi32 + #define _mm512_i64gather_epi32(vindex, base_addr, scale) simde_mm512_i64gather_epi32((vindex), (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_epi32((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) simde_mm256_mask_mov_epi32(src, k, simde_mm512_i64gather_epi32((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_epi32 + #define _mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_epi32((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_i64gather_epi64(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private + vindex_ = simde__m512i_to_private(vindex), + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + int64_t dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.i64[i] = dst; + } + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) _mm512_i64gather_epi64((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m512i_private simde_mm512_i64gather_epi64_r_, \ + simde_mm512_i64gather_epi64_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_epi64_r_.m256i[0] = 
_mm256_i64gather_epi64( \ + HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_epi64_r_.m256i[1] = _mm256_i64gather_epi64( \ + HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[1], (scale)); \ + simde__m512i_from_private(simde_mm512_i64gather_epi64_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) \ + simde_x_mm512_set_m256i( \ + _mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \ + _mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) ) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_epi64 + #define _mm512_i64gather_epi64(vindex, base_addr, scale) simde_mm512_i64gather_epi64(vindex, (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_epi64((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) simde_mm512_mask_mov_epi64((src), (k), simde_mm512_i64gather_epi64((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_epi64 + #define _mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_epi64((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_i64gather_pd(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_; + simde__m512d_private r_; + vindex_ = simde__m512i_to_private(vindex); + r_ = simde__m512d_to_private(simde_mm512_setzero_pd()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + simde_float64 dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.f64[i] = dst; + } + + return simde__m512d_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_pd(vindex, base_addr, scale) _mm512_i64gather_pd((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_pd(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m512d_private simde_mm512_i64gather_pd_r_; \ + simde__m512i_private simde_mm512_i64gather_pd_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_pd_r_.m256d[0] = _mm256_i64gather_pd( \ + HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_pd_r_.m256d[1] = _mm256_i64gather_pd( \ + HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[1], (scale)); \ + simde__m512d_from_private(simde_mm512_i64gather_pd_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_pd(vindex, base_addr, scale) 
\ + simde_x_mm512_set_m256d( \ + _mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ + _mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 0), (scale)) ) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_pd + #define _mm512_i64gather_pd(vindex, base_addr, scale) simde_mm512_i64gather_pd((vindex), (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_pd((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) simde_mm512_mask_mov_pd((src), (k), simde_mm512_i64gather_pd((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_pd + #define _mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_pd((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm512_i64gather_ps(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_; + simde__m256_private r_; + vindex_ = simde__m512i_to_private(vindex); + r_ = simde__m256_to_private(simde_mm256_setzero_ps()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + simde_float32 dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.f32[i] = dst; + } + + return simde__m256_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_ps(vindex, base_addr, scale) _mm512_i64gather_ps((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_ps(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m256_private simde_mm512_i64gather_ps_r_; \ + simde__m512i_private simde_mm512_i64gather_ps_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_ps_r_.m128[0] = _mm256_i64gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_ps_r_.m128[1] = _mm256_i64gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[1], (scale)); \ + simde__m256_from_private(simde_mm512_i64gather_ps_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_ps(vindex, base_addr, scale) \ + _mm256_insertf128_ps( \ + _mm256_castps128_ps256( \ + _mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \ + _mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ + 1) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_ps + #define _mm512_i64gather_ps(vindex, base_addr, scale) simde_mm512_i64gather_ps((vindex), (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + 
#define simde_mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_ps((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) simde_mm256_mask_mov_ps((src), (k), simde_mm512_i64gather_ps((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_ps + #define _mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_ps((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_GATHER_H) */ diff --git a/x86/avx512/insert.h b/x86/avx512/insert.h index 5a9da038a..67120d31c 100644 --- a/x86/avx512/insert.h +++ b/x86/avx512/insert.h @@ -41,7 +41,13 @@ simde_mm512_insertf32x4 (simde__m512 a, simde__m128 b, int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { #if defined(SIMDE_X86_AVX512F_NATIVE) simde__m512 r; - SIMDE_CONSTIFY_4_(_mm512_insertf32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, a, b); + switch(imm8) { + case 0: r = _mm512_insertf32x4(a, b, 0); break; + case 1: r = _mm512_insertf32x4(a, b, 1); break; + case 2: r = _mm512_insertf32x4(a, b, 2); break; + case 3: r = _mm512_insertf32x4(a, b, 3); break; + default: HEDLEY_UNREACHABLE(); r = simde_mm512_setzero_ps(); break; + } return r; #else simde__m512_private a_ = simde__m512_to_private(a); @@ -295,7 +301,7 @@ simde_mm512_mask_insertf32x8(simde__m512 src, simde__mmask16 k, simde__m512 a, s } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_insertf32x8 - #define _mm512_mask_insertf32x8(src, k, a, b, imm8) simde_mm512_mask_insertf32x8(src, k, a, b, imms8) + #define _mm512_mask_insertf32x8(src, k, a, b, imm8) simde_mm512_mask_insertf32x8(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -313,7 +319,7 @@ simde_mm512_maskz_insertf32x8(simde__mmask16 k, simde__m512 a, simde__m256 b, co } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_insertf32x8 - #define _mm512_maskz_insertf32x8(k, a, b, imm8) simde_mm512_maskz_insertf32x8(k, a, b, imms8) + #define _mm512_maskz_insertf32x8(k, a, b, imm8) simde_mm512_maskz_insertf32x8(k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -349,7 +355,7 @@ simde_mm512_mask_insertf64x2(simde__m512d src, simde__mmask8 k, simde__m512d a, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_insertf64x2 - #define _mm512_mask_insertf64x2(src, k, a, b, imm8) simde_mm512_mask_insertf64x2(src, k, a, b, imms8) + #define _mm512_mask_insertf64x2(src, k, a, b, imm8) simde_mm512_mask_insertf64x2(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -367,7 +373,7 @@ simde_mm512_maskz_insertf64x2(simde__mmask8 k, simde__m512d a, simde__m128d b, c } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_insertf64x2 - #define _mm512_maskz_insertf64x2(k, a, b, imm8) simde_mm512_maskz_insertf64x2(k, a, b, imms8) + #define _mm512_maskz_insertf64x2(k, a, b, imm8) simde_mm512_maskz_insertf64x2(k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -403,7 +409,7 @@ simde_mm512_mask_inserti32x8(simde__m512i src, simde__mmask16 k, simde__m512i a, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_inserti32x8 - #define _mm512_mask_inserti32x8(src, k, a, b, imm8) simde_mm512_mask_inserti32x8(src, k, a, b, imms8) + #define _mm512_mask_inserti32x8(src, k, a, b, imm8) simde_mm512_mask_inserti32x8(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES 
@@ -421,7 +427,7 @@ simde_mm512_maskz_inserti32x8(simde__mmask16 k, simde__m512i a, simde__m256i b, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_inserti32x8 - #define _mm512_maskz_inserti32x8(k, a, b, imm8) simde_mm512_maskz_inserti32x8(k, a, b, imms8) + #define _mm512_maskz_inserti32x8(k, a, b, imm8) simde_mm512_maskz_inserti32x8(k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -457,7 +463,7 @@ simde_mm512_mask_inserti64x2(simde__m512i src, simde__mmask8 k, simde__m512i a, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_inserti64x2 - #define _mm512_mask_inserti64x2(src, k, a, b, imm8) simde_mm512_mask_inserti64x2(src, k, a, b, imms8) + #define _mm512_mask_inserti64x2(src, k, a, b, imm8) simde_mm512_mask_inserti64x2(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -475,7 +481,7 @@ simde_mm512_maskz_inserti64x2(simde__mmask8 k, simde__m512i a, simde__m128i b, c } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_inserti64x2 - #define _mm512_maskz_inserti64x2(k, a, b, imm8) simde_mm512_maskz_inserti64x2(k, a, b, imms8) + #define _mm512_maskz_inserti64x2(k, a, b, imm8) simde_mm512_maskz_inserti64x2(k, a, b, imm8) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/kand.h b/x86/avx512/kand.h new file mode 100644 index 000000000..786410007 --- /dev/null +++ b/x86/avx512/kand.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_KAND_H) +#define SIMDE_X86_AVX512_KAND_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_kand (simde__mmask16 a, simde__mmask16 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_kand(a, b); + #else + return a & b; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_kand + #define _mm512_kand(a, b) simde_mm512_kand((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_KAND_H) */ diff --git a/x86/avx512/knot.h b/x86/avx512/knot.h new file mode 100644 index 000000000..3b4696e8b --- /dev/null +++ b/x86/avx512/knot.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_KNOT_H) +#define SIMDE_X86_AVX512_KNOT_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_knot_mask8 (simde__mmask8 a) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask8(a); + #else + return ~a; + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _knot_mask8 + #define _knot_mask8(a) simde_knot_mask8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_knot_mask16 (simde__mmask16 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask16(a); + #else + return ~a; + #endif +} +#define simde_mm512_knot(a) simde_knot_mask16(a) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _knot_mask16 + #undef _mm512_knot + #define _knot_mask16(a) simde_knot_mask16(a) + #define _mm512_knot(a) simde_knot_mask16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_knot_mask32 (simde__mmask32 a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask32(a); + #else + return ~a; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _knot_mask32 + #define _knot_mask32(a) simde_knot_mask32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_knot_mask64 (simde__mmask64 a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask64(a); + #else + return ~a; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _knot_mask64 + #define _knot_mask64(a) simde_knot_mask64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_KNOT_H) */ diff --git a/x86/avx512/kxor.h b/x86/avx512/kxor.h new file mode 100644 index 000000000..45f5d04da --- /dev/null +++ b/x86/avx512/kxor.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
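Illustrative sketch, not part of the patch: the new kand.h and knot.h headers above (and kxor.h, whose license header continues below) wrap the AVX-512 mask-register intrinsics; on the fallback path a simde__mmaskN is just an N-bit unsigned integer, so the operations reduce to ordinary bitwise C. A minimal example with plain uint16_t masks — the mask16_* helper names are mine, not SIMDe's.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Portable equivalents of the fallback branches in kand.h / knot.h / kxor.h:
 * a 16-bit mask is treated as a plain integer. */
static uint16_t mask16_and(uint16_t a, uint16_t b) { return (uint16_t) (a & b); }
static uint16_t mask16_xor(uint16_t a, uint16_t b) { return (uint16_t) (a ^ b); }
static uint16_t mask16_not(uint16_t a)             { return (uint16_t) (~a);    }

int main(void) {
  uint16_t a = 0x0F0F, b = 0x00FF;
  printf("and: 0x%04" PRIX16 "\n", mask16_and(a, b)); /* 0x000F */
  printf("xor: 0x%04" PRIX16 "\n", mask16_xor(a, b)); /* 0x0FF0 */
  printf("not: 0x%04" PRIX16 "\n", mask16_not(a));    /* 0xF0F0 */
  return 0;
}

When the relevant AVX-512 feature (F, DQ or BW) and a new-enough compiler are detected, the headers above call the corresponding _kand/_knot/_kxor intrinsics instead; the integer fallback is bit-for-bit equivalent.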
+ * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_KXOR_H) +#define SIMDE_X86_AVX512_KXOR_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_kxor_mask8 (simde__mmask8 a, simde__mmask8 b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask8(a, b); + #else + return a^b; + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask8 + #define _kxor_mask8(a, b) simde_kxor_mask8(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_kxor_mask16 (simde__mmask16 a, simde__mmask16 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask16(a, b); + #else + return a^b; + #endif +} +#define simde_mm512_kxor(a, b) simde_kxor_mask16(a, b) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask16 + #undef _mm512_kxor + #define _kxor_mask16(a, b) simde_kxor_mask16(a, b) + #define _mm512_kxor(a, b) simde_kxor_mask16(a, b) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_kxor_mask32 (simde__mmask32 a, simde__mmask32 b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask32(a, b); + #else + return a^b; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask32 + #define _kxor_mask32(a, b) simde_kxor_mask32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_kxor_mask64 (simde__mmask64 a, simde__mmask64 b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask64(a, b); + #else + return a^b; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask64 + #define _kxor_mask64(a, b) simde_kxor_mask64(a, b) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_KXOR_H) */ diff --git a/x86/avx512/load.h b/x86/avx512/load.h index 03d7327c7..6a4af937d 100644 --- a/x86/avx512/load.h +++ b/x86/avx512/load.h @@ -33,6 +33,54 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_load_pd (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_load_pd(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d)); + #else + simde__m512d r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_load_pd + #define _mm512_load_pd(a) simde_mm512_load_pd(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_load_ps (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_load_ps(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512)); + #else + simde__m512 r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_load_ps + #define _mm512_load_ps(a) 
simde_mm512_load_ps(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_load_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_load_ph(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512h)); + #else + simde__m512h r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512h), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_load_ph + #define _mm512_load_ph(a) simde_mm512_load_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_load_si512 (void const * mem_addr) { diff --git a/x86/avx512/loadu.h b/x86/avx512/loadu.h index 06f3bd83b..4a31966b4 100644 --- a/x86/avx512/loadu.h +++ b/x86/avx512/loadu.h @@ -73,46 +73,222 @@ simde_mm512_loadu_pd (void const * mem_addr) { #define _mm512_loadu_pd(a) simde_mm512_loadu_pd(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_loadu_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_loadu_ph(mem_addr); + #else + simde__m512h r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_loadu_ph + #define _mm512_loadu_ph(a) simde_mm512_loadu_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_loadu_si512 (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_loadu_si512(HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + simde__m512i r; + #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_PACKED_ + struct simde_mm512_loadu_si512_s { + __typeof__(r) v; + } __attribute__((__packed__, __may_alias__)); + r = HEDLEY_REINTERPRET_CAST(const struct simde_mm512_loadu_si512_s *, mem_addr)->v; + HEDLEY_DIAGNOSTIC_POP #else - simde__m512i r; - - #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_PACKED_ - struct simde_mm512_loadu_si512_s { - __typeof__(r) v; - } __attribute__((__packed__, __may_alias__)); - r = HEDLEY_REINTERPRET_CAST(const struct simde_mm512_loadu_si512_s *, mem_addr)->v; - HEDLEY_DIAGNOSTIC_POP - #else - simde_memcpy(&r, mem_addr, sizeof(r)); - #endif - - return r; + simde_memcpy(&r, mem_addr, sizeof(r)); #endif + + return r; } -#define simde_mm512_loadu_epi8(mem_addr) simde_mm512_loadu_si512(mem_addr) -#define simde_mm512_loadu_epi16(mem_addr) simde_mm512_loadu_si512(mem_addr) -#define simde_mm512_loadu_epi32(mem_addr) simde_mm512_loadu_si512(mem_addr) -#define simde_mm512_loadu_epi64(mem_addr) simde_mm512_loadu_si512(mem_addr) +#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(10,0,0)) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) + #define simde_mm512_loadu_si512(mem_addr) _mm512_loadu_si512(mem_addr) + #define simde_mm512_loadu_epi32(mem_addr) _mm512_loadu_epi32(mem_addr) + #define simde_mm512_loadu_epi64(mem_addr) _mm512_loadu_epi64(mem_addr) +#else + #define simde_mm512_loadu_epi32(mem_addr) simde_mm512_loadu_si512(mem_addr) + #define simde_mm512_loadu_epi64(mem_addr) simde_mm512_loadu_si512(mem_addr) +#endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(11,0,0)) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) + #define simde_mm512_loadu_epi8(mem_addr) _mm512_loadu_epi8(mem_addr) + #define simde_mm512_loadu_epi16(mem_addr) _mm512_loadu_epi16(mem_addr) +#else + #define simde_mm512_loadu_epi8(mem_addr) 
simde_mm512_loadu_si512(mem_addr) + #define simde_mm512_loadu_epi16(mem_addr) simde_mm512_loadu_si512(mem_addr) +#endif #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_loadu_epi8 #undef _mm512_loadu_epi16 - #define _mm512_loadu_epi8(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi16(a) simde_mm512_loadu_si512(a) + #define _mm512_loadu_epi8(a) simde_mm512_loadu_epi8(a) + #define _mm512_loadu_epi16(a) simde_mm512_loadu_epi16(a) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_loadu_epi32 #undef _mm512_loadu_epi64 #undef _mm512_loadu_si512 #define _mm512_loadu_si512(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi32(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi64(a) simde_mm512_loadu_si512(a) + #define _mm512_loadu_epi32(a) simde_mm512_loadu_epi32(a) + #define _mm512_loadu_epi64(a) simde_mm512_loadu_epi64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_loadu_epi16 (simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_epi16(k, simde_mm256_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_epi16 + #define _mm256_maskz_loadu_epi16(k, mem_addr) simde_mm256_maskz_loadu_epi16(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_maskz_loadu_ps (simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_ps(k, simde_mm256_loadu_ps(HEDLEY_REINTERPRET_CAST(const float*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_ps + #define _mm256_maskz_loadu_ps(k, mem_addr) simde_mm256_maskz_loadu_ps(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_loadu_epi16 (simde__m512i src, simde__mmask32 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_loadu_epi16(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi16(src, k, simde_mm512_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi16 + #define _mm512_mask_loadu_epi16(src, k, mem_addr) simde_mm512_mask_loadu_epi16(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_loadu_epi16 (simde__mmask32 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_epi16(k, simde_mm512_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_epi16 + #define _mm512_maskz_loadu_epi16(k, mem_addr) simde_mm512_maskz_loadu_epi16(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_loadu_epi32 (simde__m512i src, simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_epi32(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi32(src, 
k, simde_mm512_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi32 + #define _mm512_mask_loadu_epi32(src, k, mem_addr) simde_mm512_mask_loadu_epi32(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_loadu_epi64 (simde__m512i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_epi64(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi64(src, k, simde_mm512_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi64 + #define _mm512_mask_loadu_epi64(src, k, mem_addr) simde_mm512_mask_loadu_epi64(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_loadu_epi64 (simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_epi64(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_epi64(k, simde_mm512_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_epi64 + #define _mm512_maskz_loadu_epi64(k, mem_addr) simde_mm512_maskz_loadu_epi64((k), (mem_addr)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_mask_loadu_pd (simde__m512d src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_pd(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_pd(src, k, simde_mm512_loadu_pd(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_pd + #define _mm512_mask_loadu_pd(src, k, mem_addr) simde_mm512_mask_loadu_pd(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_mask_loadu_ps (simde__m512 src, simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_ps(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_ps(src, k, simde_mm512_loadu_ps(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_ps + #define _mm512_mask_loadu_ps(src, k, mem_addr) simde_mm512_mask_loadu_ps(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_maskz_loadu_ps (simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_ps(k, simde_mm512_loadu_ps(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_ps + #define _mm512_maskz_loadu_ps(k, mem_addr) simde_mm512_maskz_loadu_ps(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_maskz_loadu_pd (simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_pd(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_pd(k, simde_mm512_loadu_pd(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_pd + #define _mm512_maskz_loadu_pd(k, mem_addr) simde_mm512_maskz_loadu_pd(k, mem_addr) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/madd.h b/x86/avx512/madd.h index 153bf067d..547d71ce4 100644 --- a/x86/avx512/madd.h +++ 
b/x86/avx512/madd.h @@ -61,7 +61,7 @@ simde_mm_maskz_madd_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_madd_epi16 - #define _mm_maskz_madd_epi16(src, k, a, b) simde_mm_maskz_madd_epi16(src, k, a, b) + #define _mm_maskz_madd_epi16(k, a, b) simde_mm_maskz_madd_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,7 +89,7 @@ simde_mm256_maskz_madd_epi16 (simde__mmask8 k, simde__m256i a, simde__m256i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_madd_epi16 - #define _mm256_maskz_madd_epi16(src, k, a, b) simde_mm256_maskz_madd_epi16(src, k, a, b) + #define _mm256_maskz_madd_epi16(k, a, b) simde_mm256_maskz_madd_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -120,7 +120,7 @@ simde_mm512_madd_epi16 (simde__m512i a, simde__m512i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_madd_epi16 - #define _mm512_madd_epi16(src, k, a, b) simde_mm512_madd_epi16(src, k, a, b) + #define _mm512_madd_epi16(a, b) simde_mm512_madd_epi16(a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -148,7 +148,7 @@ simde_mm512_maskz_madd_epi16 (simde__mmask16 k, simde__m512i a, simde__m512i b) } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_madd_epi16 - #define _mm512_maskz_madd_epi16(src, k, a, b) simde_mm512_maskz_madd_epi16(src, k, a, b) + #define _mm512_maskz_madd_epi16(k, a, b) simde_mm512_maskz_madd_epi16(k, a, b) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/maddubs.h b/x86/avx512/maddubs.h index 4b3d73917..43b5594cf 100644 --- a/x86/avx512/maddubs.h +++ b/x86/avx512/maddubs.h @@ -48,7 +48,7 @@ simde_mm_mask_maddubs_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a, } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_maddubs_epi16 - #define _mm_mask_maddubs_epi16(a, b) simde_mm_mask_maddubs_epi16(a, b) + #define _mm_mask_maddubs_epi16(src, k, a, b) simde_mm_mask_maddubs_epi16(src, k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -62,7 +62,7 @@ simde_mm_maskz_maddubs_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_maddubs_epi16 - #define _mm_maskz_maddubs_epi16(a, b) simde_mm_maskz_maddubs_epi16(a, b) + #define _mm_maskz_maddubs_epi16(k, a, b) simde_mm_maskz_maddubs_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -76,7 +76,7 @@ simde_mm256_mask_maddubs_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_maddubs_epi16 - #define _mm256_mask_maddubs_epi16(a, b) simde_mm256_mask_maddubs_epi16(a, b) + #define _mm256_mask_maddubs_epi16(src, k, a, b) simde_mm256_mask_maddubs_epi16(src, k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -90,7 +90,7 @@ simde_mm256_maskz_maddubs_epi16 (simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_maddubs_epi16 - #define _mm256_maskz_maddubs_epi16(a, b) simde_mm256_maskz_maddubs_epi16(a, b) + #define _mm256_maskz_maddubs_epi16(k, a, b) simde_mm256_maskz_maddubs_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -136,7 +136,7 @@ simde_mm512_mask_maddubs_epi16 
(simde__m512i src, simde__mmask32 k, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_maddubs_epi16 - #define _mm512_mask_maddubs_epi16(a, b) simde_mm512_mask_maddubs_epi16(a, b) + #define _mm512_mask_maddubs_epi16(src, k, a, b) simde_mm512_mask_maddubs_epi16(src, k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -150,7 +150,7 @@ simde_mm512_maskz_maddubs_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_maddubs_epi16 - #define _mm512_maskz_maddubs_epi16(a, b) simde_mm512_maskz_maddubs_epi16(a, b) + #define _mm512_maskz_maddubs_epi16(k, a, b) simde_mm512_maskz_maddubs_epi16(k, a, b) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/max.h b/x86/avx512/max.h index 8bec526ad..29ef0b37c 100644 --- a/x86/avx512/max.h +++ b/x86/avx512/max.h @@ -553,6 +553,30 @@ simde_mm512_max_pd (simde__m512d a, simde__m512d b) { #define _mm512_max_pd(a, b) simde_mm512_max_pd(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_max_ph (simde__m512h a, simde__m512h b) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_max_ph(a, b); + #else + simde__m512h_private + r_, + a_ = simde__m512h_to_private(a), + b_ = simde__m512h_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.f16[i] = simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) ? a_.f16[i] : b_.f16[i]; + } + + return simde__m512h_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_max_ph + #define _mm512_max_ph(a, b) simde_mm512_max_ph(a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_max_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { diff --git a/x86/avx512/min.h b/x86/avx512/min.h index 03ee638b4..2e1dd8437 100644 --- a/x86/avx512/min.h +++ b/x86/avx512/min.h @@ -581,6 +581,30 @@ simde_mm512_maskz_min_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { #define _mm512_maskz_min_pd(k, a, b) simde_mm512_maskz_min_pd(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_min_ph (simde__m512h a, simde__m512h b) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_min_ph(a, b); + #else + simde__m512h_private + r_, + a_ = simde__m512h_to_private(a), + b_ = simde__m512h_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.f16[i] = simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i]) ? 
a_.f16[i] : b_.f16[i]; + } + + return simde__m512h_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_min_ph + #define _mm512_min_ph(a, b) simde_mm512_min_ph(a, b) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/mov.h b/x86/avx512/mov.h index 25d5e49b9..cee9dbb37 100644 --- a/x86/avx512/mov.h +++ b/x86/avx512/mov.h @@ -451,6 +451,12 @@ simde_mm512_mask_mov_ps (simde__m512 src, simde__mmask16 k, simde__m512 a) { #define _mm512_mask_mov_ps(src, k, a) simde_mm512_mask_mov_ps(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_x_mm512_mask_mov_ph (simde__m512h src, simde__mmask32 k, simde__m512h a) { + return simde_mm512_castsi512_ph(simde_mm512_mask_mov_epi16(simde_mm512_castph_si512(src), k, simde_mm512_castph_si512(a))); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_maskz_mov_epi8 (simde__mmask16 k, simde__m128i a) { diff --git a/x86/avx512/multishift.h b/x86/avx512/multishift.h index e6a6c0979..5388d0d07 100644 --- a/x86/avx512/multishift.h +++ b/x86/avx512/multishift.h @@ -57,7 +57,7 @@ simde_mm_maskz_multishift_epi64_epi8 (simde__mmask16 k, simde__m128i a, simde__m } #if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_multishift_epi64_epi8 - #define _mm_maskz_multishift_epi64_epi8(src, k, a, b) simde_mm_maskz_multishift_epi64_epi8(src, k, a, b) + #define _mm_maskz_multishift_epi64_epi8(k, a, b) simde_mm_maskz_multishift_epi64_epi8(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -109,7 +109,7 @@ simde_mm256_maskz_multishift_epi64_epi8 (simde__mmask32 k, simde__m256i a, simde } #if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_multishift_epi64_epi8 - #define _mm256_maskz_multishift_epi64_epi8(src, k, a, b) simde_mm256_maskz_multishift_epi64_epi8(src, k, a, b) + #define _mm256_maskz_multishift_epi64_epi8(k, a, b) simde_mm256_maskz_multishift_epi64_epi8(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -161,7 +161,7 @@ simde_mm512_maskz_multishift_epi64_epi8 (simde__mmask64 k, simde__m512i a, simde } #if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_multishift_epi64_epi8 - #define _mm512_maskz_multishift_epi64_epi8(src, k, a, b) simde_mm512_maskz_multishift_epi64_epi8(src, k, a, b) + #define _mm512_maskz_multishift_epi64_epi8(k, a, b) simde_mm512_maskz_multishift_epi64_epi8(k, a, b) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/permutex.h b/x86/avx512/permutex.h new file mode 100644 index 000000000..91c35cc21 --- /dev/null +++ b/x86/avx512/permutex.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_PERMUTEX_H) +#define SIMDE_X86_AVX512_PERMUTEX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_permutex_epi64 (simde__m256i a, const int imm8) { + simde__m256i_private + a_ = simde__m256i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = a_.i64[(imm8 >> (i*2)) & 3]; + } + + return simde__m256i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_permutex_epi64(a, imm8) _mm256_permutex_epi64((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_permutex_epi64 + #define _mm256_permutex_epi64(a, imm8) simde_mm256_permutex_epi64((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_permutex_epi64 (simde__m512i a, const int imm8) { + simde__m512i_private + a_ = simde__m512i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.m256i_private[0].i64) / sizeof(r_.m256i_private[0].i64[0])) ; i++) { + r_.m256i_private[0].i64[i] = a_.m256i_private[0].i64[(imm8 >> (i*2)) & 3]; + r_.m256i_private[1].i64[i] = a_.m256i_private[1].i64[(imm8 >> (i*2)) & 3]; + } + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_permutex_epi64(a, imm8) _mm512_permutex_epi64((a), (imm8)) +#elif defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_permutex_epi64(a, imm8) SIMDE_STATEMENT_EXPR_(({\ + simde__m512i_private simde_mm512_permutex_epi64_a_ = simde__m512i_to_private((a)), simde_mm512_permutex_epi64_r_; \ + simde_mm512_permutex_epi64_r_.m256i[0] = simde_mm256_permutex_epi64(simde_mm512_permutex_epi64_a_.m256i[0], (imm8)); \ + simde_mm512_permutex_epi64_r_.m256i[1] = simde_mm256_permutex_epi64(simde_mm512_permutex_epi64_a_.m256i[1], (imm8)); \ + simde__m512i_from_private(simde_mm512_permutex_epi64_r_); \ + })) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_permutex_epi64 + #define _mm512_permutex_epi64(a, imm8) simde_mm512_permutex_epi64((a), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_permutex_epi64(src, k, a, imm8) _mm512_mask_permutex_epi64((src), (k), (a), (imm8)) +#else + #define simde_mm512_mask_permutex_epi64(src, k, a, imm8) simde_mm512_mask_mov_epi64((src), (k), simde_mm512_permutex_epi64((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_permutex_epi64 + #define _mm512_mask_permutex_epi64(src, k, a, imm8) simde_mm512_mask_permutex_epi64((src), (k), (a), (imm8)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_PERMUTEX_H) */ diff --git a/x86/avx512/permutex2var.h b/x86/avx512/permutex2var.h index b6cfc80dd..b6480c200 100644 --- a/x86/avx512/permutex2var.h +++ b/x86/avx512/permutex2var.h @@ -703,8 +703,8 @@ simde_mm256_permutex2var_epi16 (simde__m256i a, simde__m256i idx, simde__m256i b _mm256_castsi256_ps(tb), _mm256_castsi256_ps(select))); - lo = 
HEDLEY_REINTERPRET_CAST(__m256i, _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo, 0x55)); - hi = HEDLEY_REINTERPRET_CAST(__m256i, _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo, 16), 0x55)); + lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo, 0x55); + hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo, 16), 0x55); select = _mm256_cmpeq_epi16(_mm256_and_si256(idx, ones), ones); return _mm256_blendv_epi8(lo, hi, select); @@ -1178,8 +1178,8 @@ simde_mm512_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__m512i b _mm256_castsi256_ps(hilo2), _mm256_castsi256_ps(select))); - lo = HEDLEY_REINTERPRET_CAST(__m256i, _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo1, 0x55)); - hi = HEDLEY_REINTERPRET_CAST(__m256i, _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo1, 16), 0x55)); + lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo1, 0x55); + hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo1, 16), 0x55); select = _mm256_cmpeq_epi16(_mm256_and_si256(idx1, ones), ones); r_.m256i[i] = _mm256_blendv_epi8(lo, hi, select); diff --git a/x86/avx512/permutexvar.h b/x86/avx512/permutexvar.h index 617237236..1b4bf7ac6 100644 --- a/x86/avx512/permutexvar.h +++ b/x86/avx512/permutexvar.h @@ -1146,6 +1146,20 @@ simde_mm512_permutexvar_ps (simde__m512i idx, simde__m512 a) { #define _mm512_permutexvar_ps(idx, a) simde_mm512_permutexvar_ps(idx, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_permutexvar_ph (simde__m512i idx, simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_permutexvar_ph(idx, a); + #else + return simde_mm512_castsi512_ph(simde_mm512_permutexvar_epi16(idx, simde_mm512_castph_si512(a))); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_permutexvar_ph + #define _mm512_permutexvar_ph(idx, a) simde_mm512_permutexvar_ph(idx, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_mask_permutexvar_ps (simde__m512 src, simde__mmask16 k, simde__m512i idx, simde__m512 a) { diff --git a/x86/avx512/range.h b/x86/avx512/range.h index 5361aa367..1d8c0fb49 100644 --- a/x86/avx512/range.h +++ b/x86/avx512/range.h @@ -128,7 +128,7 @@ simde_mm256_range_ps (simde__m256 a, simde__m256 b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_range_ps_r_, \ + simde_mm256_range_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_range_ps_a_ = simde__m256_to_private(a), \ simde_mm256_range_ps_b_ = simde__m256_to_private(b); \ \ @@ -208,7 +208,7 @@ simde_mm512_range_ps (simde__m512 a, simde__m512 b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_range_ps_r_, \ + simde_mm512_range_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_range_ps_a_ = simde__m512_to_private(a), \ simde_mm512_range_ps_b_ = simde__m512_to_private(b); \ \ @@ -221,7 +221,7 @@ simde_mm512_range_ps (simde__m512 a, simde__m512 b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_range_ps_r_, \ + simde_mm512_range_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_range_ps_a_ = simde__m512_to_private(a), \ simde_mm512_range_ps_b_ = simde__m512_to_private(b); \ 
\ @@ -368,7 +368,7 @@ simde_mm256_range_pd (simde__m256d a, simde__m256d b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256d_private \ - simde_mm256_range_pd_r_, \ + simde_mm256_range_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ simde_mm256_range_pd_a_ = simde__m256d_to_private(a), \ simde_mm256_range_pd_b_ = simde__m256d_to_private(b); \ \ @@ -448,7 +448,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_range_pd_r_, \ + simde_mm512_range_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_range_pd_a_ = simde__m512d_to_private(a), \ simde_mm512_range_pd_b_ = simde__m512d_to_private(b); \ \ @@ -461,7 +461,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_range_pd_r_, \ + simde_mm512_range_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_range_pd_a_ = simde__m512d_to_private(a), \ simde_mm512_range_pd_b_ = simde__m512d_to_private(b); \ \ @@ -615,7 +615,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_ss - #define _mm_maskz_range_ss(k, a, b, imm8) simde_mm_mask_range_ss(k, a, b, imm8) + #define _mm_maskz_range_ss(k, a, b, imm8) simde_mm_maskz_range_ss(k, a, b, imm8) #endif #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) @@ -736,7 +736,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_sd - #define _mm_maskz_range_sd(k, a, b, imm8) simde_mm_mask_range_sd(k, a, b, imm8) + #define _mm_maskz_range_sd(k, a, b, imm8) simde_mm_maskz_range_sd(k, a, b, imm8) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/range_round.h b/x86/avx512/range_round.h index 6f4a7b6b8..7bf132075 100644 --- a/x86/avx512/range_round.h +++ b/x86/avx512/range_round.h @@ -117,7 +117,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_range_round_ps - #define _mm512_mask_range_round_ps(src, k, a, b, imm8) simde_mm512_mask_range_round_ps(src, k, a, b, imm8) + #define _mm512_mask_range_round_ps(src, k, a, b, imm8, sae) simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -173,7 +173,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_range_round_ps - #define _mm512_maskz_range_round_ps(k, a, b, imm8) simde_mm512_maskz_range_round_ps(k, a, b, imm8) + #define _mm512_maskz_range_round_ps(k, a, b, imm8, sae) simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -285,7 +285,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_range_round_pd - #define _mm512_mask_range_round_pd(src, k, a, b, imm8) simde_mm512_mask_range_round_pd(src, k, a, b, imm8) + #define _mm512_mask_range_round_pd(src, k, a, b, imm8, sae) simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) #endif #if 
defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -341,7 +341,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_range_round_pd - #define _mm512_maskz_range_round_pd(k, a, b, imm8) simde_mm512_maskz_range_round_pd(k, a, b, imm8) + #define _mm512_maskz_range_round_pd(k, a, b, imm8, sae) simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -453,7 +453,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_mask_range_round_ss - #define _mm_mask_range_round_ss(src, k, a, b, imm8) simde_mm_mask_range_round_ss(src, k, a, b, imm8) + #define _mm_mask_range_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -509,7 +509,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_round_ss - #define _mm_maskz_range_round_ss(k, a, b, imm8) simde_mm_maskz_range_round_ss(k, a, b, imm8) + #define _mm_maskz_range_round_ss(k, a, b, imm8, sae) simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -621,7 +621,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_mask_range_round_sd - #define _mm_mask_range_round_sd(src, k, a, b, imm8) simde_mm_mask_range_round_sd(src, k, a, b, imm8) + #define _mm_mask_range_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -677,7 +677,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_round_sd - #define _mm_maskz_range_round_sd(k, a, b, imm8) simde_mm_maskz_range_round_sd(k, a, b, imm8) + #define _mm_maskz_range_round_sd(k, a, b, imm8, sae) simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) #endif SIMDE_END_DECLS_ diff --git a/x86/avx512/rcp.h b/x86/avx512/rcp.h new file mode 100644 index 000000000..b1b394cfe --- /dev/null +++ b/x86/avx512/rcp.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_RCP_H) +#define SIMDE_X86_AVX512_RCP_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// TODO: "The maximum relative error for this approximation is less than 2^-14." 
+// vs 1.5*2^-12 for _mm{,256}_rcp_ps + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_rcp14_ps (simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_rcp14_ps(a); + #else + simde__m512_private + r_, + a_ = simde__m512_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i]; + } + + return simde__m512_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_rcp14_ps + #define _mm512_rcp14_ps(a) simde_mm512_rcp14_ps(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_RCP_H) */ diff --git a/x86/avx512/reduce.h b/x86/avx512/reduce.h new file mode 100644 index 000000000..c007572e2 --- /dev/null +++ b/x86/avx512/reduce.h @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_REDUCE_H) +#define SIMDE_X86_AVX512_REDUCE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_mm512_reduce_max_ph(simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_reduce_max_ph(a); + #else + simde__m512h_private a_; + simde_float16 r; + a_ = simde__m512h_to_private(a); + + r = SIMDE_NINFINITYHF; + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE_REDUCTION(max:r) + #endif + for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { + r = simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(r) ? a_.f16[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_ph(a) simde_mm512_reduce_max_ph((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_mm512_reduce_min_ph(simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_reduce_min_ph(a); + #else + simde__m512h_private a_; + simde_float16 r; + a_ = simde__m512h_to_private(a); + + r = SIMDE_INFINITYHF; + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE_REDUCTION(min:r) + #endif + for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { + r = simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(r) ? 
a_.f16[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_ph(a) simde_mm512_reduce_min_ph((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_max_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = -INT32_MAX; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r = a_.i32[i] > r ? a_.i32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epi32(a) simde_mm512_reduce_max_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_max_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = -INT64_MAX; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r = a_.i64[i] > r ? a_.i64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epi64(a) simde_mm512_reduce_max_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_mm512_reduce_max_epu32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epu32(a); + #else + simde__m512i_private a_; + uint32_t r; + a_ = simde__m512i_to_private(a); + + r = 0; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r = a_.u32[i] > r ? a_.u32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epu32(a) simde_mm512_reduce_max_epu32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_mm512_reduce_max_epu64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epu64(a); + #else + simde__m512i_private a_; + uint64_t r; + a_ = simde__m512i_to_private(a); + + r = 0; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r = a_.u64[i] > r ? a_.u64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epu64(a) simde_mm512_reduce_max_epu64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64 +simde_mm512_reduce_max_pd(simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_pd(a); + #else + simde__m512d_private a_; + simde_float64 r; + a_ = simde__m512d_to_private(a); + + r = -SIMDE_MATH_INFINITY; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r = a_.f64[i] > r ? a_.f64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_pd(a) simde_mm512_reduce_max_pd((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32 +simde_mm512_reduce_max_ps(simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_ps(a); + #else + simde__m512_private a_; + simde_float32 r; + a_ = simde__m512_to_private(a); + + r = -SIMDE_MATH_INFINITYF; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r = a_.f32[i] > r ? 
a_.f32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_ps(a) simde_mm512_reduce_max_ps((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_min_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = INT32_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r = a_.i32[i] < r ? a_.i32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epi32(a) simde_mm512_reduce_min_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_min_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = INT64_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r = a_.i64[i] < r ? a_.i64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epi64(a) simde_mm512_reduce_min_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_mm512_reduce_min_epu32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epu32(a); + #else + simde__m512i_private a_; + uint32_t r; + a_ = simde__m512i_to_private(a); + + r = UINT32_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r = a_.u32[i] < r ? a_.u32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epu32(a) simde_mm512_reduce_min_epu32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_mm512_reduce_min_epu64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epu64(a); + #else + simde__m512i_private a_; + uint64_t r; + a_ = simde__m512i_to_private(a); + + r = UINT64_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r = a_.u64[i] < r ? a_.u64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epu64(a) simde_mm512_reduce_min_epu64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64 +simde_mm512_reduce_min_pd(simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_pd(a); + #else + simde__m512d_private a_; + simde_float64 r; + a_ = simde__m512d_to_private(a); + + r = SIMDE_MATH_INFINITY; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r = a_.f64[i] < r ? a_.f64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_pd(a) simde_mm512_reduce_min_pd((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32 +simde_mm512_reduce_min_ps(simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_ps(a); + #else + simde__m512_private a_; + simde_float32 r; + a_ = simde__m512_to_private(a); + + r = SIMDE_MATH_INFINITYF; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r = a_.f32[i] < r ? 
a_.f32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_ps(a) simde_mm512_reduce_min_ps((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_REDUCE_H) */ diff --git a/x86/avx512/rol.h b/x86/avx512/rol.h index 835bf6bbb..5bdf98bc1 100644 --- a/x86/avx512/rol.h +++ b/x86/avx512/rol.h @@ -73,7 +73,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_rol_epi32 - #define _mm_maskz_rol_epi32(src, k, a, imm8) simde_mm_maskz_rol_epi32(src, k, a, imm8) + #define _mm_maskz_rol_epi32(k, a, imm8) simde_mm_maskz_rol_epi32(k, a, imm8) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) diff --git a/x86/avx512/ror.h b/x86/avx512/ror.h index 464f71f0f..7cac56c7e 100644 --- a/x86/avx512/ror.h +++ b/x86/avx512/ror.h @@ -73,7 +73,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_ror_epi32 - #define _mm_maskz_ror_epi32(src, k, a, imm8) simde_mm_maskz_ror_epi32(src, k, a, imm8) + #define _mm_maskz_ror_epi32(k, a, imm8) simde_mm_maskz_ror_epi32(k, a, imm8) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) diff --git a/x86/avx512/round.h b/x86/avx512/round.h index 954e348c1..684dbe045 100644 --- a/x86/avx512/round.h +++ b/x86/avx512/round.h @@ -10,7 +10,7 @@ SIMDE_BEGIN_DECLS_ #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_x_mm512_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_x_mm512_round_ps_r_, \ + simde_x_mm512_round_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_x_mm512_round_ps_a_ = simde__m512_to_private(a); \ \ for (size_t simde_x_mm512_round_ps_i = 0 ; simde_x_mm512_round_ps_i < (sizeof(simde_x_mm512_round_ps_r_.m256) / sizeof(simde_x_mm512_round_ps_r_.m256[0])) ; simde_x_mm512_round_ps_i++) { \ @@ -148,7 +148,7 @@ SIMDE_BEGIN_DECLS_ #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_x_mm512_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_x_mm512_round_pd_r_, \ + simde_x_mm512_round_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_x_mm512_round_pd_a_ = simde__m512d_to_private(a); \ \ for (size_t simde_x_mm512_round_pd_i = 0 ; simde_x_mm512_round_pd_i < (sizeof(simde_x_mm512_round_pd_r_.m256d) / sizeof(simde_x_mm512_round_pd_r_.m256d[0])) ; simde_x_mm512_round_pd_i++) { \ diff --git a/x86/avx512/roundscale.h b/x86/avx512/roundscale.h index b44923c24..80c9abf2b 100644 --- a/x86/avx512/roundscale.h +++ b/x86/avx512/roundscale.h @@ -18,7 +18,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_roundscale_ps_internal_ (simde__m128 result, simde__m128 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m128 r, clear_sign; @@ -73,7 +73,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_roundscale_ps(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_roundscale_ps_r_, \ + simde_mm256_roundscale_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_roundscale_ps_a_ = simde__m256_to_private(a); \ \ for (size_t simde_mm256_roundscale_ps_i = 0 ; 
simde_mm256_roundscale_ps_i < (sizeof(simde_mm256_roundscale_ps_r_.m128) / sizeof(simde_mm256_roundscale_ps_r_.m128[0])) ; simde_mm256_roundscale_ps_i++) { \ @@ -85,7 +85,7 @@ SIMDE_BEGIN_DECLS_ #else SIMDE_FUNCTION_ATTRIBUTES simde__m256 - simde_mm256_roundscale_ps_internal_ (simde__m256 result, simde__m256 a, int imm8) + simde_mm256_roundscale_ps_internal_ (simde__m256 result, simde__m256 a, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); @@ -141,7 +141,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_roundscale_ps(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_roundscale_ps_r_, \ + simde_mm512_roundscale_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_roundscale_ps_a_ = simde__m512_to_private(a); \ \ for (size_t simde_mm512_roundscale_ps_i = 0 ; simde_mm512_roundscale_ps_i < (sizeof(simde_mm512_roundscale_ps_r_.m256) / sizeof(simde_mm512_roundscale_ps_r_.m256[0])) ; simde_mm512_roundscale_ps_i++) { \ @@ -154,7 +154,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_roundscale_ps_internal_ (simde__m512 result, simde__m512 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m512 r, clear_sign; @@ -210,7 +210,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_roundscale_pd_internal_ (simde__m128d result, simde__m128d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m128d r, clear_sign; @@ -265,7 +265,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_roundscale_pd(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256d_private \ - simde_mm256_roundscale_pd_r_, \ + simde_mm256_roundscale_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ simde_mm256_roundscale_pd_a_ = simde__m256d_to_private(a); \ \ for (size_t simde_mm256_roundscale_pd_i = 0 ; simde_mm256_roundscale_pd_i < (sizeof(simde_mm256_roundscale_pd_r_.m128d) / sizeof(simde_mm256_roundscale_pd_r_.m128d[0])) ; simde_mm256_roundscale_pd_i++) { \ @@ -278,7 +278,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m256d simde_mm256_roundscale_pd_internal_ (simde__m256d result, simde__m256d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m256d r, clear_sign; @@ -333,7 +333,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_roundscale_pd(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_roundscale_pd_r_, \ + simde_mm512_roundscale_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_roundscale_pd_a_ = simde__m512d_to_private(a); \ \ for (size_t simde_mm512_roundscale_pd_i = 0 ; simde_mm512_roundscale_pd_i < (sizeof(simde_mm512_roundscale_pd_r_.m256d) / sizeof(simde_mm512_roundscale_pd_r_.m256d[0])) ; simde_mm512_roundscale_pd_i++) { \ @@ -346,7 +346,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_roundscale_pd_internal_ (simde__m512d result, simde__m512d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m512d r, clear_sign; @@ -401,7 +401,7 @@ SIMDE_BEGIN_DECLS_ #else 
SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_roundscale_ss_internal_ (simde__m128 result, simde__m128 b, int imm8) + simde_mm_roundscale_ss_internal_ (simde__m128 result, simde__m128 b, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); @@ -508,7 +508,7 @@ SIMDE_BEGIN_DECLS_ #else SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_roundscale_sd_internal_ (simde__m128d result, simde__m128d b, int imm8) + simde_mm_roundscale_sd_internal_ (simde__m128d result, simde__m128d b, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); diff --git a/x86/avx512/roundscale_round.h b/x86/avx512/roundscale_round.h index debc11330..f941e48da 100644 --- a/x86/avx512/roundscale_round.h +++ b/x86/avx512/roundscale_round.h @@ -8,6 +8,11 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning( push ) +#pragma warning( disable : 4244 ) +#endif + #if defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm512_roundscale_round_ps(a, imm8, sae) _mm512_roundscale_round_ps(a, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) @@ -37,8 +42,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_roundscale_round_ps (simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512 r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -93,8 +97,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_mask_roundscale_round_ps (simde__m512 src, simde__mmask8 k, simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512 r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -149,8 +152,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_maskz_roundscale_round_ps (simde__mmask8 k, simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512 r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -205,8 +207,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_roundscale_round_pd (simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512d r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -261,8 +262,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_roundscale_round_pd (simde__m512d src, simde__mmask8 k, simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512d r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -317,8 +317,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_maskz_roundscale_round_pd (simde__mmask8 k, simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512d r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -369,10 +368,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_roundscale_round_ss(a, b, imm8, sae) simde_mm_roundscale_ss(a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_roundscale_round_ss (simde__m128 a, simde__m128 b, int imm8, int sae) + simde_mm_roundscale_round_ss (simde__m128 a, simde__m128 b, const 
int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128 r; @@ -425,10 +424,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_ss(src, k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_mask_roundscale_round_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8, int sae) + simde_mm_mask_roundscale_round_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128 r; @@ -481,10 +480,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_ss(k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_maskz_roundscale_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8, int sae) + simde_mm_maskz_roundscale_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128 r; @@ -512,6 +511,11 @@ SIMDE_BEGIN_DECLS_ #define _mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) #endif +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning( pop ) +#endif + + #if defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm_roundscale_round_sd(a, b, imm8, sae) _mm_roundscale_round_sd(a, b, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) @@ -537,10 +541,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_roundscale_round_sd(a, b, imm8, sae) simde_mm_roundscale_sd(a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_roundscale_round_sd (simde__m128d a, simde__m128d b, int imm8, int sae) + simde_mm_roundscale_round_sd (simde__m128d a, simde__m128d b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128d r; @@ -593,10 +597,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_sd(src, k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_mask_roundscale_round_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8, int sae) + simde_mm_mask_roundscale_round_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128d r; @@ -649,10 +653,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) simde_mm_maskz_roundscale_sd(k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_maskz_roundscale_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8, int sae) + simde_mm_maskz_roundscale_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128d r; diff --git a/x86/avx512/set.h b/x86/avx512/set.h index 1e681af68..d87a72ce3 
100644 --- a/x86/avx512/set.h +++ b/x86/avx512/set.h @@ -401,7 +401,7 @@ SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_x_mm512_set_m128i (simde__m128i a, simde__m128i b, simde__m128i c, simde__m128i d) { #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_LIKE_16(simde__m128i) simde__m128i v[] = { d, c, b, a }; + SIMDE_ALIGN_TO_64 simde__m128i v[] = { d, c, b, a }; return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v))); #else simde__m512i_private r_; @@ -415,11 +415,27 @@ simde_x_mm512_set_m128i (simde__m128i a, simde__m128i b, simde__m128i c, simde__ #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_x_mm512_set_m256 (simde__m256 a, simde__m256 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + SIMDE_ALIGN_TO_64 simde__m256 v[] = { b, a }; + return simde_mm512_load_ps(HEDLEY_STATIC_CAST(__m512 *, HEDLEY_STATIC_CAST(void *, v))); + #else + simde__m512_private r_; + + r_.m256[0] = b; + r_.m256[1] = a; + + return simde__m512_from_private(r_); + #endif +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_LIKE_32(simde__m256i) simde__m256i v[] = { b, a }; + SIMDE_ALIGN_TO_64 simde__m256i v[] = { b, a }; return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v))); #else simde__m512i_private r_; @@ -431,6 +447,22 @@ simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) { #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_x_mm512_set_m256d (simde__m256d a, simde__m256d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + SIMDE_ALIGN_TO_64 simde__m256d v[] = { b, a }; + return simde_mm512_load_pd(HEDLEY_STATIC_CAST(__m512d *, HEDLEY_STATIC_CAST(void *, v))); + #else + simde__m512d_private r_; + + r_.m256d[0] = b; + r_.m256d[1] = a; + + return simde__m512d_from_private(r_); + #endif +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_set_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12, @@ -484,6 +516,56 @@ simde_mm512_set_pd (simde_float64 e7, simde_float64 e6, simde_float64 e5, simde_ #define _mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_set_ph (simde_float16 e31, simde_float16 e30, simde_float16 e29, simde_float16 e28, simde_float16 e27, simde_float16 e26, simde_float16 e25, simde_float16 e24, + simde_float16 e23, simde_float16 e22, simde_float16 e21, simde_float16 e20, simde_float16 e19, simde_float16 e18, simde_float16 e17, simde_float16 e16, + simde_float16 e15, simde_float16 e14, simde_float16 e13, simde_float16 e12, simde_float16 e11, simde_float16 e10, simde_float16 e9, simde_float16 e8, + simde_float16 e7, simde_float16 e6, simde_float16 e5, simde_float16 e4, simde_float16 e3, simde_float16 e2, simde_float16 e1, simde_float16 e0) { + simde__m512h_private r_; + + r_.f16[0] = e0; + r_.f16[1] = e1; + r_.f16[2] = e2; + r_.f16[3] = e3; + r_.f16[4] = e4; + r_.f16[5] = e5; + r_.f16[6] = e6; + r_.f16[7] = e7; + r_.f16[8] = e8; + r_.f16[9] = e9; + r_.f16[10] = e10; + r_.f16[11] = e11; + r_.f16[12] = e12; + r_.f16[13] = e13; + r_.f16[14] = e14; + r_.f16[15] = e15; + r_.f16[16] = e16; + r_.f16[17] = e17; + r_.f16[18] = e18; + r_.f16[19] = e19; + r_.f16[20] = e20; + r_.f16[21] = e21; + r_.f16[22] = e22; + r_.f16[23] = e23; + r_.f16[24] = e24; + r_.f16[25] = e25; + r_.f16[26] = e26; + r_.f16[27] = e27; + r_.f16[28] = e28; + r_.f16[29] = e29; + r_.f16[30] = e30; + r_.f16[31] = e31; + + 
return simde__m512h_from_private(r_); +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_set_ph + #define _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ + simde_mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) +#endif + + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/set1.h b/x86/avx512/set1.h index 82c9c8cca..33ae84183 100644 --- a/x86/avx512/set1.h +++ b/x86/avx512/set1.h @@ -325,6 +325,27 @@ simde_mm512_set1_pd (simde_float64 a) { #define _mm512_set1_pd(a) simde_mm512_set1_pd(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_set1_ph (simde_float16 a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_set1_ph(a); + #else + simde__m512h_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.f16[i] = a; + } + + return simde__m512h_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_set1_ph + #define _mm512_set1_ph(a) simde_mm512_set1_ph(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/setone.h b/x86/avx512/setone.h index 087dbb567..df2f6e8bb 100644 --- a/x86/avx512/setone.h +++ b/x86/avx512/setone.h @@ -60,6 +60,12 @@ simde_x_mm512_setone_pd(void) { return simde_mm512_castsi512_pd(simde_x_mm512_setone_si512()); } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_x_mm512_setone_ph(void) { + return simde_mm512_castsi512_ph(simde_x_mm512_setone_si512()); +} + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/setzero.h b/x86/avx512/setzero.h index c34381735..c5bfdc458 100644 --- a/x86/avx512/setzero.h +++ b/x86/avx512/setzero.h @@ -66,8 +66,8 @@ simde_mm512_setzero_ps(void) { #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_si512 - #define _mm512_setzero_si512() simde_mm512_setzero_si512() + #undef _mm512_setzero_ps + #define _mm512_setzero_ps() simde_mm512_setzero_ps() #endif SIMDE_FUNCTION_ATTRIBUTES @@ -80,10 +80,25 @@ simde_mm512_setzero_pd(void) { #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_si512 - #define _mm512_setzero_si512() simde_mm512_setzero_si512() + #undef _mm512_setzero_pd + #define _mm512_setzero_pd() simde_mm512_setzero_pd() #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_setzero_ph(void) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_setzero_ph(); + #else + return simde_mm512_castsi512_ph(simde_mm512_setzero_si512()); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_setzero_ph + #define _mm512_setzero_ph() simde_mm512_setzero_ph() +#endif + + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/shuffle.h b/x86/avx512/shuffle.h index 93fc577af..14faee1bf 100644 --- a/x86/avx512/shuffle.h +++ b/x86/avx512/shuffle.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Michael R. 
Crusoe */ #if !defined(SIMDE_X86_AVX512_SHUFFLE_H) @@ -31,6 +32,7 @@ #include "types.h" #include "../avx2.h" #include "mov.h" +#include "extract.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -53,8 +55,8 @@ simde_mm512_shuffle_epi8 (simde__m512i a, simde__m512i b) { } #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (b_.u8[i] & 0x80) ? 0 : a_.u8[(b_.u8[i] & 0x0f) + (i & 0x30)]; + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (b_.i8[i] & 0x80) ? 0 : a_.i8[(b_.i8[i] & 0x0f) + (i & 0x30)]; } #endif @@ -94,6 +96,35 @@ simde_mm512_maskz_shuffle_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b #define _mm512_maskz_shuffle_epi8(k, a, b) simde_mm512_maskz_shuffle_epi8(k, a, b) #endif +#if defined(SIMDE_X86_AVX512F_NATIVE) +# define simde_mm512_shuffle_epi32(a, imm8) _mm512_shuffle_epi32((a), HEDLEY_STATIC_CAST(_MM_PERM_ENUM, (imm8))) +#elif defined(SIMDE_STATEMENT_EXPR_) +# define simde_mm512_shuffle_epi32(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ + simde__m512i_private simde_mm512_shuffle_epi32_r_, \ + simde_mm512_shuffle_epi32_a_ = simde__m512i_to_private((a)); \ + simde_mm512_shuffle_epi32_r_.m128i[0] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[0], (imm8)); \ + simde_mm512_shuffle_epi32_r_.m128i[1] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[1], (imm8)); \ + simde_mm512_shuffle_epi32_r_.m128i[2] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[2], (imm8)); \ + simde_mm512_shuffle_epi32_r_.m128i[3] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[3], (imm8)); \ + simde__m512i_from_private(simde_mm512_shuffle_epi32_r_); \ + })) +#else +# define simde_mm512_shuffle_epi32(a, imm8) \ + simde_x_mm512_set_m128i( \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 3), (imm8)), \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 2), (imm8)), \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 1), (imm8)), \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 0), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_shuffle_epi32 + #define _mm512_shuffle_epi32(a, imm8) simde_mm512_shuffle_epi32((a), (imm8)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_shuffle_i32x4 (simde__m256i a, simde__m256i b, const int imm8) @@ -131,6 +162,34 @@ simde_mm256_shuffle_i32x4 (simde__m256i a, simde__m256i b, const int imm8) #define simde_mm256_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm256_maskz_mov_pd(k, simde_mm256_shuffle_f64x2(a, b, imm8)) #define simde_mm256_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm256_mask_mov_pd(src, k, simde_mm256_shuffle_f64x2(a, b, imm8)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_shuffle_i32x4 + #undef _mm256_mask_shuffle_i32x4 + #define _mm256_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm256_maskz_shuffle_i32x4(k, a, b, imm8) + #define _mm256_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm256_mask_shuffle_i32x4(src, k, a, b, imm8) + + #undef _mm256_shuffle_f32x4 + #undef _mm256_maskz_shuffle_f32x4 + #undef _mm256_mask_shuffle_f32x4 + #define _mm256_shuffle_f32x4(a, b, imm8) simde_mm256_shuffle_f32x4(a, b, imm8) + #define _mm256_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm256_maskz_shuffle_f32x4(k, a, b, imm8) + #define _mm256_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm256_mask_shuffle_f32x4(src, k, a, b, imm8) + + #undef _mm256_shuffle_i64x2 
+ #undef _mm256_maskz_shuffle_i64x2 + #undef _mm256_mask_shuffle_i64x2 + #define _mm256_shuffle_i64x2(a, b, imm8) simde_mm256_shuffle_i64x2(a, b, imm8) + #define _mm256_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm256_maskz_shuffle_i64x2(k, a, b, imm8) + #define _mm256_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm256_mask_shuffle_i64x2(src, k, a, b, imm8) + + #undef _mm256_shuffle_f64x2 + #undef _mm256_maskz_shuffle_f64x2 + #undef _mm256_mask_shuffle_f64x2 + #define _mm256_shuffle_f64x2(a, b, imm8) simde_mm256_shuffle_f64x2(a, b, imm8) + #define _mm256_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm256_maskz_shuffle_f64x2(k, a, b, imm8) + #define _mm256_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm256_mask_shuffle_f64x2(src, k, a, b, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) @@ -170,6 +229,34 @@ simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) #define simde_mm512_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm512_maskz_mov_pd(k, simde_mm512_shuffle_f64x2(a, b, imm8)) #define simde_mm512_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm512_mask_mov_pd(src, k, simde_mm512_shuffle_f64x2(a, b, imm8)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_shuffle_i32x4 + #undef _mm512_mask_shuffle_i32x4 + #define _mm512_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm512_maskz_shuffle_i32x4(k, a, b, imm8) + #define _mm512_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm512_mask_shuffle_i32x4(src, k, a, b, imm8) + + #undef _mm512_shuffle_f32x4 + #undef _mm512_maskz_shuffle_f32x4 + #undef _mm512_mask_shuffle_f32x4 + #define _mm512_shuffle_f32x4(a, b, imm8) simde_mm512_shuffle_f32x4(a, b, imm8) + #define _mm512_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm512_maskz_shuffle_f32x4(k, a, b, imm8) + #define _mm512_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm512_mask_shuffle_f32x4(src, k, a, b, imm8) + + #undef _mm512_shuffle_i64x2 + #undef _mm512_maskz_shuffle_i64x2 + #undef _mm512_mask_shuffle_i64x2 + #define _mm512_shuffle_i64x2(a, b, imm8) simde_mm512_shuffle_i64x2(a, b, imm8) + #define _mm512_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm512_maskz_shuffle_i64x2(k, a, b, imm8) + #define _mm512_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm512_mask_shuffle_i64x2(src, k, a, b, imm8) + + #undef _mm512_shuffle_f64x2 + #undef _mm512_maskz_shuffle_f64x2 + #undef _mm512_mask_shuffle_f64x2 + #define _mm512_shuffle_f64x2(a, b, imm8) simde_mm512_shuffle_f64x2(a, b, imm8) + #define _mm512_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm512_maskz_shuffle_f64x2(k, a, b, imm8) + #define _mm512_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm512_mask_shuffle_f64x2(src, k, a, b, imm8) +#endif + #if defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm512_shuffle_ps(a, b, imm8) _mm512_shuffle_ps(a, b, imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) @@ -224,8 +311,8 @@ simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) a_ = simde__m512_to_private(a), b_ = simde__m512_to_private(b); + const size_t halfway = (sizeof(r_.m128_private[0].f32) / sizeof(r_.m128_private[0].f32[0]) / 2); for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - const size_t halfway = (sizeof(r_.m128_private[i].f32) / sizeof(r_.m128_private[i].f32[0]) / 2); SIMDE_VECTORIZE for (size_t j = 0 ; j < halfway ; j++) { r_.m128_private[i].f32[j] = a_.m128_private[i].f32[(imm8 >> (j * 2)) & 3]; @@ -241,6 +328,89 @@ simde_mm512_shuffle_i32x4 (simde__m512i 
a, simde__m512i b, const int imm8) #define _mm512_shuffle_ps(a, b, imm8) simde_mm512_shuffle_ps(a, b, imm8) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_shuffle_pd(simde__m512d a, simde__m512d b, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { + simde__m512d_private + r_, + a_ = simde__m512d_to_private(a), + b_ = simde__m512d_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r_.f64) / sizeof(r_.f64[0])) / 2) ; i++) { + r_.f64[i * 2] = (imm8 & ( 1 << (i*2) )) ? a_.f64[i * 2 + 1]: a_.f64[i * 2]; + r_.f64[i * 2 + 1] = (imm8 & ( 1 << (i*2+1) )) ? b_.f64[i * 2 + 1]: b_.f64[i * 2]; + } + + return simde__m512d_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_shuffle_pd(a, b, imm8) _mm512_shuffle_pd(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_shuffle_pd + #define _mm512_shuffle_pd(a, b, imm8) simde_mm512_shuffle_pd(a, b, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) +# define simde_mm512_shufflehi_epi16(a, imm8) _mm512_shufflehi_epi16(a, imm8) +#elif defined(SIMDE_STATEMENT_EXPR_) +# define simde_mm512_shufflehi_epi16(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ + simde__m512i_private simde_mm512_shufflehi_epi16_r_, \ + simde_mm512_shufflehi_epi16_a_ = simde__m512i_to_private((a)); \ + simde_mm512_shufflehi_epi16_r_.m128i[0] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[0], (imm8)); \ + simde_mm512_shufflehi_epi16_r_.m128i[1] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[1], (imm8)); \ + simde_mm512_shufflehi_epi16_r_.m128i[2] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[2], (imm8)); \ + simde_mm512_shufflehi_epi16_r_.m128i[3] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[3], (imm8)); \ + simde__m512i_from_private(simde_mm512_shufflehi_epi16_r_); \ + })) +#else +# define simde_mm512_shufflehi_epi16(a, imm8) \ + simde_x_mm512_set_m128i( \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 3), (imm8)), \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 2), (imm8)), \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 1), (imm8)), \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 0), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_shufflehi_epi16 + #define _mm512_shufflehi_epi16(a, imm8) simde_mm512_shufflehi_epi16(a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) +# define simde_mm512_shufflelo_epi16(a, imm8) _mm512_shufflelo_epi16(a, imm8) +#elif defined(SIMDE_STATEMENT_EXPR_) +# define simde_mm512_shufflelo_epi16(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ + simde__m512i_private simde_mm512_shufflelo_epi16_r_, \ + simde_mm512_shufflelo_epi16_a_ = simde__m512i_to_private((a)); \ + simde_mm512_shufflelo_epi16_r_.m128i[0] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[0], (imm8)); \ + simde_mm512_shufflelo_epi16_r_.m128i[1] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[1], (imm8)); \ + simde_mm512_shufflelo_epi16_r_.m128i[2] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[2], (imm8)); \ + simde_mm512_shufflelo_epi16_r_.m128i[3] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[3], (imm8)); \ + simde__m512i_from_private(simde_mm512_shufflelo_epi16_r_); \ + })) +#else +# define simde_mm512_shufflelo_epi16(a, imm8) \ + simde_x_mm512_set_m128i( \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 3), 
(imm8)), \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 2), (imm8)), \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 1), (imm8)), \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 0), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_shufflelo_epi16 + #define _mm512_shufflelo_epi16(a, imm8) simde_mm512_shufflelo_epi16(a, imm8) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/sll.h b/x86/avx512/sll.h index 8cc944648..18fbbb8ce 100644 --- a/x86/avx512/sll.h +++ b/x86/avx512/sll.h @@ -102,7 +102,7 @@ simde_mm512_maskz_sll_epi16 (simde__mmask32 k, simde__m512i a, simde__m128i coun } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_sll_epi16 - #define _mm512_maskz_sll_epi16(src, k, a, count) simde_mm512_maskz_sll_epi16(src, k, a, count) + #define _mm512_maskz_sll_epi16(k, a, count) simde_mm512_maskz_sll_epi16(k, a, count) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/x86/avx512/srai.h b/x86/avx512/srai.h index e7ba354aa..4fcbd95c0 100644 --- a/x86/avx512/srai.h +++ b/x86/avx512/srai.h @@ -64,6 +64,32 @@ simde_mm512_srai_epi16 (simde__m512i a, const int imm8) { #define _mm512_srai_epi16(a, imm8) simde_mm512_srai_epi16(a, imm8) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_srai_epi32 (simde__m512i a, const unsigned int imm8) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int32_t, imm8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = a_.i32[i] >> imm8; + } + #endif + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) +# define simde_mm512_srai_epi32(a, imm8) _mm512_srai_epi32(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_srai_epi32 + #define _mm512_srai_epi32(a, imm8) simde_mm512_srai_epi32(a, imm8) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/storeu.h b/x86/avx512/storeu.h index dee1db091..e00801faf 100644 --- a/x86/avx512/storeu.h +++ b/x86/avx512/storeu.h @@ -28,11 +28,45 @@ #define SIMDE_X86_AVX512_STOREU_H #include "types.h" +#include "mov.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#define simde_mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#define simde_mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#define simde_mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#define simde_mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_storeu_epi8 + #undef _mm256_storeu_epi16 + #define _mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) + #define _mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_storeu_epi32 + #undef _mm256_storeu_epi64 + #define _mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) + #define _mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_epi16 (void * mem_addr, simde__mmask16 k, simde__m256i a) { + #if 
defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m256i zero = simde_mm256_setzero_si256(); + simde_mm256_storeu_epi16(mem_addr, simde_mm256_mask_mov_epi16(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_epi16 + #define _mm256_mask_storeu_epi16(mem_addr, k, a) simde_mm256_mask_storeu_epi16(mem_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_storeu_ps (void * mem_addr, simde__m512 a) { @@ -61,6 +95,20 @@ simde_mm512_storeu_pd (void * mem_addr, simde__m512d a) { #define _mm512_storeu_pd(mem_addr, a) simde_mm512_storeu_pd(mem_addr, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_storeu_ph (void * mem_addr, simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + _mm512_storeu_ph(mem_addr, a); + #else + simde_memcpy(mem_addr, &a, sizeof(a)); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_storeu_ph + #define _mm512_storeu_ph(mem_addr, a) simde_mm512_storeu_ph(mem_addr, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_storeu_si512 (void * mem_addr, simde__m512i a) { @@ -74,19 +122,96 @@ simde_mm512_storeu_si512 (void * mem_addr, simde__m512i a) { #define simde_mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define simde_mm512_storeu_epi32(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define simde_mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_storeu_epi8 #undef _mm512_storeu_epi16 + #define _mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) + #define _mm512_storeu_epi8(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_storeu_epi32 #undef _mm512_storeu_epi64 #undef _mm512_storeu_si512 #define _mm512_storeu_si512(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi8(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define _mm512_storeu_epi32(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define _mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi16 (void * mem_addr, simde__mmask32 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + _mm512_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i zero = simde_mm512_setzero_si512(); + simde_mm512_storeu_epi16(mem_addr, simde_mm512_mask_mov_epi16(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi16 + #define _mm512_mask_storeu_epi16(mem_addr, k, a) simde_mm512_mask_storeu_epi16(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi32 (void * mem_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_epi32(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i zero = simde_mm512_setzero_si512(); + simde_mm512_storeu_epi32(mem_addr, simde_mm512_mask_mov_epi32(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi32 
+ #define _mm512_mask_storeu_epi32(mem_addr, k, a) simde_mm512_mask_storeu_epi32(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi64 (void * mem_addr, simde__mmask8 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_epi64(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i zero = simde_mm512_setzero_si512(); + simde_mm512_storeu_epi64(mem_addr, simde_mm512_mask_mov_epi64(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi64 + #define _mm512_mask_storeu_epi64(mem_addr, k, a) simde_mm512_mask_storeu_epi64(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_ps (void * mem_addr, simde__mmask16 k, simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_ps(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512 zero = simde_mm512_setzero_ps(); + simde_mm512_storeu_ps(mem_addr, simde_mm512_mask_mov_ps(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_ps + #define _mm512_mask_storeu_ps(mem_addr, k, a) simde_mm512_mask_storeu_ps(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_pd (void * mem_addr, simde__mmask8 k, simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_pd(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512d zero = simde_mm512_setzero_pd(); + simde_mm512_storeu_pd(mem_addr, simde_mm512_mask_mov_pd(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_pd + #define _mm512_mask_storeu_pd(mem_addr, k, a) simde_mm512_mask_storeu_pd(mem_addr, k, a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/types.h b/x86/avx512/types.h index 37a07e17e..cb5ad82c0 100644 --- a/x86/avx512/types.h +++ b/x86/avx512/types.h @@ -26,8 +26,8 @@ #if !defined(SIMDE_X86_AVX512_TYPES_H) #define SIMDE_X86_AVX512_TYPES_H - #include "../avx.h" +#include "../../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -376,6 +376,73 @@ typedef union { #endif } simde__m512d_private; +typedef union { + #if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint16_t u16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #else + SIMDE_AVX512_ALIGN simde_float16 f16[32]; + #endif + SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint_fast32_t u32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #else + SIMDE_AVX512_ALIGN int8_t i8[64]; + 
SIMDE_AVX512_ALIGN int16_t i16[32]; + SIMDE_AVX512_ALIGN int32_t i32[16]; + SIMDE_AVX512_ALIGN int64_t i64[8]; + SIMDE_AVX512_ALIGN uint8_t u8[64]; + SIMDE_AVX512_ALIGN uint16_t u16[32]; + SIMDE_AVX512_ALIGN uint32_t u32[16]; + SIMDE_AVX512_ALIGN uint64_t u64[8]; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_AVX512_ALIGN simde_int128 i128[4]; + SIMDE_AVX512_ALIGN simde_uint128 u128[4]; + #endif + SIMDE_AVX512_ALIGN simde_float16 f16[32]; + SIMDE_AVX512_ALIGN simde_float32 f32[16]; + SIMDE_AVX512_ALIGN simde_float64 f64[8]; + SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; + SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; + #endif + + SIMDE_AVX512_ALIGN simde__m128d_private m128d_private[4]; + SIMDE_AVX512_ALIGN simde__m128d m128d[4]; + SIMDE_AVX512_ALIGN simde__m256d_private m256d_private[2]; + SIMDE_AVX512_ALIGN simde__m256d m256d[2]; + + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + SIMDE_AVX512_ALIGN __m512h n; + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; + #endif + #endif +} simde__m512h_private; + + typedef union { #if defined(SIMDE_VECTOR_SUBSCRIPT) SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; @@ -460,7 +527,9 @@ typedef union { * * As for the ICC check, unlike other compilers, merely using the * AVX-512 types causes ICC to generate AVX-512 instructions. */ -#if (defined(_MM_CMPINT_GE) || defined(_MM_CMPINT_NLT)) && (defined(SIMDE_X86_AVX512F_NATIVE) || !defined(HEDLEY_INTEL_VERSION)) +#if (defined(_MM_CMPINT_GE) || defined(_MM_CMPINT_NLT)) && \ + (defined(SIMDE_X86_AVX512F_NATIVE) || \ + !(defined(HEDLEY_INTEL_VERSION) || (defined(HEDLEY_MSVC_VERSION) && !defined(__clang__)))) typedef __m512 simde__m512; typedef __m512i simde__m512i; typedef __m512d simde__m512d; @@ -476,7 +545,7 @@ typedef union { typedef simde__m512_private simde__m512; typedef simde__m512i_private simde__m512i; typedef simde__m512d_private simde__m512d; - #endif + #endif typedef uint8_t simde__mmask8; typedef uint16_t simde__mmask16; @@ -498,6 +567,16 @@ typedef union { #endif #endif +#if defined(SIMDE_X86_AVX512FP16_NATIVE) + typedef __m512h simde__m512h; +#else + #if defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_FLOAT16_VECTOR) + typedef simde_float16 simde__m512h SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #else + typedef simde__m512h_private simde__m512h; + #endif +#endif + /* These are really part of AVX-512VL / AVX-512BW (in GCC __mmask32 is * in avx512vlintrin.h and __mmask64 is in avx512bwintrin.h, in clang * both are in avx512bwintrin.h), not AVX-512F. However, we don't have @@ -512,6 +591,31 @@ typedef union { * issue and we'll try to figure out a work-around. 
*/ typedef uint32_t simde__mmask32; typedef uint64_t simde__mmask64; +#if !defined(__mmask16) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + typedef uint16_t __mmask16; + #else + #define __mmask16 uint16_t + #endif +#endif +#if !defined(__mmask32) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + typedef uint32_t __mmask32; + #else + #define __mmask32 uint32_t + #endif +#endif +#if !defined(__mmask64) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + #if defined(HEDLEY_GCC_VERSION) || (defined(__clang__) && SIMDE_DETECT_CLANG_VERSION_CHECK(3, 6, 0)) + typedef unsigned long long __mmask64; + #else + typedef uint64_t __mmask64; + #endif + #else + #define __mmask64 uint64_t + #endif +#endif #if !defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) #if !defined(HEDLEY_INTEL_VERSION) @@ -537,6 +641,18 @@ typedef uint64_t simde__mmask64; #endif #endif +#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + //typedef simde__m128h __m128h; + //typedef simde__m256h __m256h; + typedef simde__m512h __m512h; + #else + //#define __m128h simde__m128h + //#define __m256h simde__m256h + #define __m512h simde__m512h + #endif +#endif + HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh), "simde__m128bh size incorrect"); HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh_private), "simde__m128bh_private size incorrect"); HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256bh), "simde__m256bh size incorrect"); @@ -549,6 +665,8 @@ HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512i), "simde__m512i size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512i_private), "simde__m512i_private size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d), "simde__m512d size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d_private), "simde__m512d_private size incorrect"); +HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h), "simde__m512h size incorrect"); +HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h_private), "simde__m512h_private size incorrect"); #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh) == 16, "simde__m128bh is not 16-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh_private) == 16, "simde__m128bh_private is not 16-byte aligned"); @@ -562,6 +680,27 @@ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512i) == 32, "simde__m512i is not 32 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512i_private) == 32, "simde__m512i_private is not 32-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512d) == 32, "simde__m512d is not 32-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512d_private) == 32, "simde__m512d_private is not 32-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h) == 32, "simde__m512h is not 32-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h_private) == 32, "simde__m512h_private is not 32-byte aligned"); +#endif + +#define SIMDE_MM_CMPINT_EQ 0 +#define SIMDE_MM_CMPINT_LT 1 +#define SIMDE_MM_CMPINT_LE 2 +#define SIMDE_MM_CMPINT_FALSE 3 +#define SIMDE_MM_CMPINT_NE 4 +#define SIMDE_MM_CMPINT_NLT 5 +#define SIMDE_MM_CMPINT_NLE 6 +#define SIMDE_MM_CMPINT_TRUE 7 +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && !defined(_MM_CMPINT_EQ) +#define _MM_CMPINT_EQ SIMDE_MM_CMPINT_EQ +#define _MM_CMPINT_LT SIMDE_MM_CMPINT_LT +#define _MM_CMPINT_LE SIMDE_MM_CMPINT_LE +#define _MM_CMPINT_FALSE 
SIMDE_MM_CMPINT_FALSE +#define _MM_CMPINT_NE SIMDE_MM_CMPINT_NE +#define _MM_CMPINT_NLT SIMDE_MM_CMPINT_NLT +#define _MM_CMPINT_NLE SIMDE_MM_CMPINT_NLE +#define _MM_CMPINT_TRUE SIMDE_MM_CMPINT_TRUE #endif SIMDE_FUNCTION_ATTRIBUTES @@ -660,6 +799,22 @@ simde__m512d_to_private(simde__m512d v) { return r; } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde__m512h_from_private(simde__m512h_private v) { + simde__m512h r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h_private +simde__m512h_to_private(simde__m512h v) { + simde__m512h_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/avx512/xorsign.h b/x86/avx512/xorsign.h index 38fb5f942..f7fdc8c97 100644 --- a/x86/avx512/xorsign.h +++ b/x86/avx512/xorsign.h @@ -26,7 +26,7 @@ */ /* This is a SIMDe extension which is not part of AVX-512. It exists - * because a lot of numerical methods in SIMDe have algoriths which do + * because a lot of numerical methods in SIMDe have algorithms which do * something like: * * float sgn = input < 0 ? -1 : 1; diff --git a/x86/clmul.h b/x86/clmul.h index 5ba97d7ab..7d51b5b3f 100644 --- a/x86/clmul.h +++ b/x86/clmul.h @@ -101,6 +101,10 @@ simde_x_bitreverse_u64(uint64_t v) { return HEDLEY_STATIC_CAST(uint64_t, _mm_cvtsi128_si64(vec)); #elif HEDLEY_HAS_BUILTIN(__builtin_bitreverse64) return __builtin_bitreverse64(v); + #elif defined(__loongarch64) + uint64_t r; + __asm__ __volatile__ ("bitrev.d %0, %1" :"=&r"(r):"r"(v):); + return r; #else v = ((v >> 1) & UINT64_C(0x5555555555555555)) | ((v & UINT64_C(0x5555555555555555)) << 1); v = ((v >> 2) & UINT64_C(0x3333333333333333)) | ((v & UINT64_C(0x3333333333333333)) << 2); @@ -120,21 +124,7 @@ simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8) b_ = simde__m128i_to_private(b), r_; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES) - uint64x1_t A = ((imm8) & 0x01) ? vget_high_u64(a_.neon_u64) : vget_low_u64(a_.neon_u64); - uint64x1_t B = ((imm8) & 0x10) ?
vget_high_u64(b_.neon_u64) : vget_low_u64(b_.neon_u64); - #if defined(SIMDE_BUG_CLANG_48257) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - #endif - poly64_t A_ = vget_lane_p64(vreinterpret_p64_u64(A), 0); - poly64_t B_ = vget_lane_p64(vreinterpret_p64_u64(B), 0); - #if defined(SIMDE_BUG_CLANG_48257) - HEDLEY_DIAGNOSTIC_POP - #endif - poly128_t R = vmull_p64(A_, B_); - r_.neon_u64 = vreinterpretq_u64_p128(R); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #if SIMDE_NATURAL_VECTOR_SIZE_GE(128) #if defined(SIMDE_SHUFFLE_VECTOR_) switch (imm8 & 0x11) { case 0x00: @@ -211,9 +201,9 @@ simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8) SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ })) #else - #define simde_mm_clmulepi64_si128(a, b, imm8) simde_mm_clmulepi64_si128(a, b, imm8) + #define simde_mm_clmulepi64_si128(a, b, imm8) _mm_clmulepi64_si128(a, b, imm8) #endif -#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES) +#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && !defined(__clang__) #define simde_mm_clmulepi64_si128(a, b, imm8) \ simde__m128i_from_neon_u64( \ vreinterpretq_u64_p128( \ @@ -238,84 +228,61 @@ simde_mm256_clmulepi64_epi128 (simde__m256i a, simde__m256i b, const int imm8) b_ = simde__m256i_to_private(b), r_; - #if defined(SIMDE_X86_PCLMUL_NATIVE) - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS - switch (imm8 & 0x11) { + simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_; + + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) + switch (imm8 & 0x01) { case 0x00: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x00); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x00); + a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2); break; case 0x01: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x01); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x01); + a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3); break; - case 0x10: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x10); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x10); + } + switch (imm8 & 0x10) { + case 0x00: + b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2); break; - case 0x11: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x11); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x11); + case 0x10: + b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3); break; } - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS #else - simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_; - - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - switch (imm8 & 0x01) { - case 0x00: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2); - break; - case 0x01: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3); - break; - } - switch (imm8 & 0x10) { - case 0x00: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2); - break; - case 0x10: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3); - break; - } - #else - a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0]; - a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2]; - b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0]; - b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2]; - #endif + a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0]; + a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2]; + b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0]; + b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2]; + #endif - SIMDE_VECTORIZE - for 
(size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) { - a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]); - b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) { + a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]); + b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]); - r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]); - r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]); + r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]); + r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]); - r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1; - } + r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1; + } - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_))); - r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3); - #else - r_.u64[0] = r_lo_.u64[0]; - r_.u64[1] = r_hi_.u64[0]; - r_.u64[2] = r_lo_.u64[1]; - r_.u64[3] = r_hi_.u64[1]; - #endif + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) + r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_))); + r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3); + #else + r_.u64[0] = r_lo_.u64[0]; + r_.u64[1] = r_hi_.u64[0]; + r_.u64[2] = r_lo_.u64[1]; + r_.u64[3] = r_hi_.u64[1]; #endif return simde__m256i_from_private(r_); } -#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) #define simde_mm256_clmulepi64_epi128(a, b, imm8) _mm256_clmulepi64_epi128(a, b, imm8) #endif -#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_clmulepi64_epi128 #define _mm256_clmulepi64_epi128(a, b, imm8) simde_mm256_clmulepi64_epi128(a, b, imm8) #endif @@ -409,10 +376,10 @@ simde_mm512_clmulepi64_epi128 (simde__m512i a, simde__m512i b, const int imm8) return simde__m512i_from_private(r_); } -#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) +#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm512_clmulepi64_epi128(a, b, imm8) _mm512_clmulepi64_epi128(a, b, imm8) #endif -#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_clmulepi64_epi128 #define _mm512_clmulepi64_epi128(a, b, imm8) simde_mm512_clmulepi64_epi128(a, b, imm8) #endif diff --git a/x86/f16c.h b/x86/f16c.h index 51ba779ac..27828a44c 100644 --- a/x86/f16c.h +++ b/x86/f16c.h @@ -43,34 +43,33 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m128i -simde_mm_cvtps_ph(simde__m128 a, const int sae) { - #if defined(SIMDE_X86_F16C_NATIVE) - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS - switch (sae & SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_NO_EXC: - return _mm_cvtps_ph(a, SIMDE_MM_FROUND_NO_EXC); - default: - return _mm_cvtps_ph(a, 0); - } - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS - #else - 
simde__m128_private a_ = simde__m128_to_private(a); - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); +simde_mm_cvtps_ph(simde__m128 a, const int imm8) { + simde__m128_private a_ = simde__m128_to_private(a); + simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - HEDLEY_STATIC_CAST(void, sae); + HEDLEY_STATIC_CAST(void, imm8); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif - - return simde__m128i_from_private(r_); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcvt_h_s((v4f32)__lsx_vreplgr2vr_w(0), a_.lsx_f32); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_float16_from_float32(a_.f32[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); + } #endif + + return simde__m128i_from_private(r_); } +#if defined(SIMDE_X86_F16C_NATIVE) + #define simde_mm_cvtps_ph(a, imm8) _mm_cvtps_ph(a, imm8) +#endif #if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) #define _mm_cvtps_ph(a, sae) simde_mm_cvtps_ph(a, sae) #endif @@ -84,8 +83,15 @@ simde_mm_cvtph_ps(simde__m128i a) { simde__m128i_private a_ = simde__m128i_to_private(a); simde__m128_private r_; - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfcvtl_s_h(a_.lsx_i64); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16)); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(a_.f16[i]); + } #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { @@ -102,39 +108,36 @@ simde_mm_cvtph_ps(simde__m128i a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i -simde_mm256_cvtps_ph(simde__m256 a, const int sae) { - #if defined(SIMDE_X86_F16C_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS - switch (sae & SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_NO_EXC: - return _mm256_cvtps_ph(a, SIMDE_MM_FROUND_NO_EXC); - default: - return _mm256_cvtps_ph(a, 0); - } - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS - #else - simde__m256_private a_ = simde__m256_to_private(a); - simde__m128i_private r_; +simde_mm256_cvtps_ph(simde__m256 a, const int imm8) { + simde__m256_private a_ = simde__m256_to_private(a); + simde__m128i_private r_; - HEDLEY_STATIC_CAST(void, sae); + HEDLEY_STATIC_CAST(void, imm8); - #if defined(SIMDE_X86_F16C_NATIVE) - return _mm_castps_si128(_mm_movelh_ps( - _mm_castsi128_ps(_mm_cvtps_ph(a_.m128[0], SIMDE_MM_FROUND_NO_EXC)), - _mm_castsi128_ps(_mm_cvtps_ph(a_.m128[1], SIMDE_MM_FROUND_NO_EXC)) - )); - #else - SIMDE_VECTORIZE + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = 
__lasx_xvfcvt_h_s(a_.f256, a_.f256); + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + r_.lsx_i64 = simde_mm256_extractf128_si256(a_.i256, 0); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_float16_from_float32(a_.f32[i]); + } + #else + SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); } - #endif - - return simde__m128i_from_private(r_); #endif + + + return simde__m128i_from_private(r_); } +#if defined(SIMDE_X86_F16C_NATIVE) + #define simde_mm256_cvtps_ph(a, imm8) _mm256_cvtps_ph(a, imm8) +#endif #if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm256_cvtps_ph(a, sae) simde_mm256_cvtps_ph(a, sae) + #define _mm256_cvtps_ph(a, imm8) simde_mm256_cvtps_ph(a, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -151,10 +154,21 @@ simde_mm256_cvtph_ps(simde__m128i a) { simde__m128i_private a_ = simde__m128i_to_private(a); simde__m256_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = simde_mm256_castsi128_si256(a_.lsx_i64); + r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8); + r_.f256 = __lasx_xvfcvtl_s_h(r_.i256); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(a_.f16[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); + } + #endif return simde__m256_from_private(r_); #endif diff --git a/x86/fma.h b/x86/fma.h index 6ed68d5bf..bb174284b 100644 --- a/x86/fma.h +++ b/x86/fma.h @@ -42,6 +42,8 @@ simde__m128d simde_mm_fmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fmadd_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfmadd_d(a, b, c); #else simde__m128d_private a_ = simde__m128d_to_private(a), @@ -78,6 +80,8 @@ simde__m256d simde_mm256_fmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fmadd_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmadd_d(a, b, c); #else return simde_mm256_add_pd(simde_mm256_mul_pd(a, b), c); #endif @@ -92,6 +96,8 @@ simde__m128 simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fmadd_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfmadd_s(a, b, c); #else simde__m128_private a_ = simde__m128_to_private(a), @@ -101,7 +107,7 @@ simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); @@ -130,6 +136,8 @@ simde__m256 simde_mm256_fmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fmadd_ps(a, b, c); + #elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmadd_s(a, b, c); #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) simde__m256_private a_ = simde__m256_to_private(a), @@ -156,6 +164,8 @@ simde__m128d simde_mm_fmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) { #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) return _mm_fmadd_sd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)__lsx_vextrins_d(a, __lsx_vfmadd_d(a, b, c), 0x00); #else return simde_mm_add_sd(simde_mm_mul_sd(a, b), c); #endif @@ -170,6 +180,8 @@ simde__m128 simde_mm_fmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) return _mm_fmadd_ss(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)__lsx_vextrins_w(a, __lsx_vfmadd_s(a, b, c), 0x00); #else return simde_mm_add_ss(simde_mm_mul_ss(a, b), c); #endif @@ -240,6 +252,8 @@ simde__m128d simde_mm_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fmsub_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfmsub_d(a, b, c); #else return simde_mm_sub_pd(simde_mm_mul_pd(a, b), c); #endif @@ -254,6 +268,8 @@ simde__m256d simde_mm256_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fmsub_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmsub_d(a, b, c); #else return simde_mm256_sub_pd(simde_mm256_mul_pd(a, b), c); #endif @@ -268,6 +284,8 @@ simde__m128 simde_mm_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fmsub_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfmsub_s(a, b, c); #else return simde_mm_sub_ps(simde_mm_mul_ps(a, b), c); #endif @@ -282,6 +300,8 @@ simde__m256 simde_mm256_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fmsub_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfmsub_s(a, b, c); #else return simde_mm256_sub_ps(simde_mm256_mul_ps(a, b), c); #endif @@ -324,6 +344,11 @@ simde__m128d simde_mm_fmsubadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fmsubadd_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + a = __lsx_vfmul_d(a, b); + b = __lsx_vfsub_d(a, c); + c = __lsx_vfadd_d(a, c); + return (simde__m128d)__lsx_vextrins_d(c, b, 0x11); #else simde__m128d_private r_, @@ -350,6 +375,11 @@ simde__m256d simde_mm256_fmsubadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fmsubadd_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + a = __lasx_xvfmul_d(a, b); + b = __lasx_xvfsub_d(a, c); + c = __lasx_xvfadd_d(a, c); + return (simde__m256d)__lasx_xvextrins_d(c, b, 0x11); #else simde__m256d_private r_, @@ -376,6 +406,11 @@ simde__m128 simde_mm_fmsubadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fmsubadd_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + a = __lsx_vfmul_s(a, b); + b = __lsx_vfsub_s(a, c); + c = __lsx_vfadd_s(a, c); + return (simde__m128)__lsx_vextrins_w(__lsx_vextrins_w(c, b, 0x11), b, 0x33); #else simde__m128_private r_, @@ -402,6 +437,11 @@ simde__m256 simde_mm256_fmsubadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fmsubadd_ps(a, 
b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + a = __lasx_xvfmul_s(a, b); + b = __lasx_xvfsub_s(a, c); + c = __lasx_xvfadd_s(a, c); + return (simde__m256)__lasx_xvextrins_w(__lasx_xvextrins_w(c, b, 0x11), b, 0x33); #else simde__m256_private r_, @@ -428,6 +468,8 @@ simde__m128d simde_mm_fnmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fnmadd_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfsub_d(c, __lsx_vfmul_d(a, b)); #else simde__m128d_private r_, @@ -457,6 +499,8 @@ simde__m256d simde_mm256_fnmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fnmadd_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsub_d(c, __lasx_xvfmul_d(a, b)); #else simde__m256d_private r_, @@ -464,11 +508,16 @@ simde_mm256_fnmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { b_ = simde__m256d_to_private(b), c_ = simde__m256d_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { + r_.m128d[i] = simde_mm_fnmadd_pd(a_.m128d[i], b_.m128d[i], c_.m128d[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; + } + #endif return simde__m256d_from_private(r_); #endif } @@ -482,6 +531,8 @@ simde__m128 simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fnmadd_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfsub_s(c, __lsx_vfmul_s(a, b)); #else simde__m128_private r_, @@ -489,7 +540,7 @@ simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { b_ = simde__m128_to_private(b), c_ = simde__m128_to_private(c); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); @@ -513,6 +564,8 @@ simde__m256 simde_mm256_fnmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fnmadd_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsub_s(c, __lasx_xvfmul_s(a, b)); #else simde__m256_private r_, @@ -584,6 +637,8 @@ simde__m128d simde_mm_fnmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fnmsub_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfsub_d((__m128d)__lsx_vreplgr2vr_d(0), __lsx_vfmadd_d(a, b, c)); #else simde__m128d_private r_, @@ -609,6 +664,8 @@ simde__m256d simde_mm256_fnmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fnmsub_pd(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsub_d((__m256d)__lasx_xvreplgr2vr_d(0), __lasx_xvfmadd_d(a, b, c)); #else simde__m256d_private r_, @@ -634,6 +691,8 @@ simde__m128 simde_mm_fnmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm_fnmsub_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(0), __lsx_vfmadd_s(a, b, 
c)); #else simde__m128_private r_, @@ -659,6 +718,8 @@ simde__m256 simde_mm256_fnmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { #if defined(SIMDE_X86_FMA_NATIVE) return _mm256_fnmsub_ps(a, b, c); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + return __lasx_xvfsub_s((__m256)__lasx_xvreplgr2vr_w(0), __lasx_xvfmadd_s(a, b, c)); #else simde__m256_private r_, diff --git a/x86/gfni.h b/x86/gfni.h index d0dd6e046..5982a3409 100644 --- a/x86/gfni.h +++ b/x86/gfni.h @@ -267,7 +267,7 @@ simde_x_mm_gf2p8matrix_multiply_epi64_epi8 (simde__m128i x, simde__m128i A) { for (int i = 0 ; i < 8 ; i++) { #if defined(SIMDE_BUG_CLANG_50932) p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_bperm(HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select)); + vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select)); #else p = vec_bperm(a, bit_select); #endif diff --git a/x86/mmx.h b/x86/mmx.h index b46bd9382..e294af8e9 100644 --- a/x86/mmx.h +++ b/x86/mmx.h @@ -1467,18 +1467,17 @@ simde_mm_slli_pi16 (simde__m64 a, int count) { simde__m64_private r_; simde__m64_private a_ = simde__m64_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) + #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) + r_.mmi_i16 = psllh_s(a_.mmi_i16, count); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) if (HEDLEY_UNLIKELY(count > 15)) return simde_mm_setzero_si64(); r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 << count; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) count)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psllh_s(a_.mmi_i16, b_.mmi_i16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -2157,10 +2156,10 @@ simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15); #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15); #else r_.i8[0] = a_.i8[4]; r_.i8[1] = b_.i8[4]; diff --git a/x86/sse.h b/x86/sse.h index f5311c14b..110142ca9 100644 --- a/x86/sse.h +++ b/x86/sse.h @@ -31,15 +31,62 @@ #define SIMDE_X86_SSE_H #include "mmx.h" - -#if defined(_WIN32) && !defined(SIMDE_X86_SSE_NATIVE) && defined(_MSC_VER) - #include -#endif +#include "../simde-f16.h" #if defined(__ARM_ACLE) #include #endif +#ifdef _MSC_VER + #if defined(SIMDE_ARCH_AARCH64) + #include + + typedef enum simde_tag_ARM64INTR_BARRIER_TYPE + { + SIMDE_ARM64_BARRIER_SY = 0xF, + } + SIMDE_ARM64INTR_BARRIER_TYPE; + + HEDLEY_ALWAYS_INLINE + void simde_MemoryBarrier(void) { + __dmb(SIMDE_ARM64_BARRIER_SY); + } + #elif defined(SIMDE_ARCH_ARM) + #include + + typedef enum simde_tag_ARMINTR_BARRIER_TYPE + { + SIMDE_ARM_BARRIER_SY = 0xF, + } + SIMDE_ARMINTR_BARRIER_TYPE; + + HEDLEY_ALWAYS_INLINE + void simde_MemoryBarrier(void) { + __dmb(SIMDE_ARM_BARRIER_SY); + } + #elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_E2K) + #if !defined(SIMDE_X86_SSE_NO_NATIVE) + #include 
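/* Editorial aside (illustrative sketch, not part of the upstream patch): the
 * simde_MemoryBarrier() helper being added here lets simde_mm_sfence() issue a
 * full barrier on MSVC without pulling in <windows.h> for MemoryBarrier(). On
 * toolchains that provide C11 atomics, a comparable full fence can be written
 * portably; the helper name below is hypothetical and the block is kept behind
 * #if 0 so it cannot interfere with the surrounding diff context. */
#if 0 /* illustrative only */
#include <stdatomic.h>

static inline void example_full_fence(void) {
  /* Sequentially consistent fence: no loads or stores may be reordered
   * across it, comparable to a full memory barrier. */
  atomic_thread_fence(memory_order_seq_cst);
}
#endif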
+ #endif + + HEDLEY_ALWAYS_INLINE + void simde_MemoryBarrier(void) { + #if defined(SIMDE_X86_SSE_NO_NATIVE) + ((void)0); // intentionally no-op + #elif defined(SIMDE_ARCH_AMD64) + __faststorefence(); + #elif defined(SIMDE_ARCH_IA64) + __mf(); + #else + long Barrier; + __asm { xchg Barrier, eax } + #endif + } + #else + #error "Missing implementation" + #endif +#endif + HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ @@ -58,6 +105,11 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; + #endif SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -74,6 +126,7 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128[1]; SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; #endif + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; SIMDE_ALIGN_TO_16 simde_float32 f32[4]; SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; @@ -121,6 +174,17 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v16i8 lsx_i8; + v8i16 lsx_i16; + v4i32 lsx_i32; + v2i64 lsx_i64; + v16u8 lsx_u8; + v8u16 lsx_u16; + v4u32 lsx_u32; + v2u64 lsx_u64; + v4f32 lsx_f32; + v2f64 lsx_f64; #endif } simde__m128_private; @@ -132,6 +196,8 @@ typedef union { typedef v128_t simde__m128; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128; +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + typedef v4f32 simde__m128; #elif defined(SIMDE_VECTOR_SUBSCRIPT) typedef simde_float32 simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #else @@ -215,6 +281,19 @@ simde__m128_to_private(simde__m128 v) { SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128); #endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */ +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16i8, lsx, i8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8i16, lsx, i16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4i32, lsx, i32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2i64, lsx, i64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16u8, lsx, u8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8u16, lsx, u16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4u32, lsx, u32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2u64, lsx, u64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4f32, lsx, f32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2f64, lsx, f64) +#endif /* defined(SIMDE_LOONGARCH_LSX_NATIVE) */ + enum { #if defined(SIMDE_X86_SSE_NATIVE) SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, @@ -228,6 +307,14 @@ enum { SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000 #endif }; +#if defined(_MM_ROUND_MASK) +# define SIMDE_MM_ROUND_MASK _MM_ROUND_MASK +#else +# define SIMDE_MM_ROUND_MASK (0x6000) +#endif +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) + #define _MM_ROUND_MASK SIMDE_MM_ROUND_MASK +#endif #if defined(_MM_FROUND_TO_NEAREST_INT) # define 
SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT @@ -395,7 +482,7 @@ enum { #endif SIMDE_FUNCTION_ATTRIBUTES -unsigned int +uint32_t SIMDE_MM_GET_ROUNDING_MODE(void) { #if defined(SIMDE_X86_SSE_NATIVE) return _MM_GET_ROUNDING_MODE(); @@ -443,7 +530,7 @@ SIMDE_MM_GET_ROUNDING_MODE(void) { SIMDE_FUNCTION_ATTRIBUTES void -SIMDE_MM_SET_ROUNDING_MODE(unsigned int a) { +SIMDE_MM_SET_ROUNDING_MODE(uint32_t a) { #if defined(SIMDE_X86_SSE_NATIVE) _MM_SET_ROUNDING_MODE(a); #elif defined(SIMDE_HAVE_FENV_H) @@ -497,7 +584,7 @@ SIMDE_MM_GET_FLUSH_ZERO_MODE (void) { #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_SET_FLUSH_ZERO_MODE(a) SIMDE_MM_SET_FLUSH_ZERO_MODE(a) + #define _MM_GET_FLUSH_ZERO_MODE(a) SIMDE_MM_GET_FLUSH_ZERO_MODE(a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -532,7 +619,7 @@ simde_mm_setcsr (uint32_t a) { #if defined(SIMDE_X86_SSE_NATIVE) _mm_setcsr(a); #else - SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(unsigned int, a)); + SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(uint32_t, a & SIMDE_MM_ROUND_MASK)); #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) @@ -569,13 +656,17 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) r_.neon_f32 = vrndiq_f32(a_.neon_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrintrne_s(a_.lsx_f32); #elif defined(simde_math_nearbyintf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_nearbyintf(a_.f32[i]); } #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps()); #endif break; @@ -584,13 +675,17 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndnq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrintrne_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); #elif defined(simde_math_roundevenf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_roundevenf(a_.f32[i]); } #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps()); #endif break; @@ -599,13 +694,17 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndmq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrintrm_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_floor(a_.wasm_v128); #elif defined(simde_math_floorf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_floorf(a_.f32[i]); } #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps()); #endif break; @@ -614,13 +713,17 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = 
HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndpq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrintrp_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_ceil(a_.wasm_v128); #elif defined(simde_math_ceilf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_ceilf(a_.f32[i]); } #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps()); #endif break; @@ -629,18 +732,22 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrintrz_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_trunc(a_.wasm_v128); #elif defined(simde_math_truncf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = simde_math_truncf(a_.f32[i]); } #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps()); #endif break; default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_ps()); } return simde__m128_from_private(r_); @@ -667,6 +774,9 @@ simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_flo r_.neon_f32 = vld1q_f32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_TO_16 simde_float32 data[4] = { e0, e1, e2, e3 }; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.f32[0] = e0; r_.f32[1] = e1; @@ -691,6 +801,10 @@ simde_mm_set_ps1 (simde_float32 a) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) (void) a; return vec_splats(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)__lsx_vldrepl_w(&a, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_splat(a); #else return simde_mm_set_ps(a, a, a, a); #endif @@ -712,15 +826,17 @@ simde_mm_move_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) m = { ~0U, 0U, 0U, 0U }; r_.altivec_f32 = vec_sel(a_.altivec_f32, b_.altivec_f32, m); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, b_.lsx_i64, 0); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); #else r_.f32[0] = b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -738,7 +854,7 @@ simde_mm_move_ss (simde__m128 a, simde__m128 b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_x_mm_broadcastlow_ps(simde__m128 a) { - /* This function broadcasts the first element 
in the inpu vector to + /* This function broadcasts the first element in the input vector to * all lanes. It is used to avoid generating spurious exceptions in * *_ss functions since there may be garbage in the upper lanes. */ @@ -753,6 +869,10 @@ simde_x_mm_broadcastlow_ps(simde__m128 a) { r_.neon_f32 = vdupq_laneq_f32(a_.neon_f32, 0); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_splat(a_.altivec_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplvei_w(a_.lsx_i64, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_splat(a_.f32[0]); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); #else @@ -783,6 +903,8 @@ simde_mm_add_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f32 = a_.f32 + b_.f32; #else @@ -819,6 +941,8 @@ simde_mm_add_ss (simde__m128 a, simde__m128 b) { float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); // the upper values in the result must be the remnants of . r_.neon_f32 = vaddq_f32(a_.neon_f32, value); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32), 0x00); #else r_.f32[0] = a_.f32[0] + b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -848,6 +972,8 @@ simde_mm_and_ps (simde__m128 a, simde__m128 b) { r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 & b_.i32; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) @@ -883,6 +1009,8 @@ simde_mm_andnot_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = ~a_.i32 & b_.i32; #else @@ -916,6 +1044,8 @@ simde_mm_xor_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f ^ b_.i32f; #else @@ -949,6 +1079,8 @@ simde_mm_or_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f | b_.i32f; #else @@ -987,6 +1119,8 @@ simde_x_mm_not_ps(simde__m128 a) { r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); 
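/* Editorial aside (illustrative sketch, not part of the upstream patch): the
 * LSX branch above derives bitwise NOT from NOR, since ~a == ~(a | a), and the
 * select helper a few hunks below leans on the identity a ^ ((a ^ b) & m),
 * which picks b wherever a mask bit is set and a elsewhere. The hypothetical
 * helpers below restate both identities on plain 32-bit integers and are kept
 * behind #if 0 so they cannot interfere with the surrounding diff context. */
#if 0 /* illustrative only */
#include <assert.h>
#include <stdint.h>

static inline uint32_t example_not_via_nor(uint32_t a) {
  return ~(a | a);              /* same bits as ~a */
}

static inline uint32_t example_bitselect(uint32_t a, uint32_t b, uint32_t m) {
  return a ^ ((a ^ b) & m);     /* bit-wise: m ? b : a */
}

static void example_check(void) {
  assert(example_not_via_nor(UINT32_C(0x0F0F)) == UINT32_C(0xFFFFF0F0));
  assert(example_bitselect(UINT32_C(0x00FF), UINT32_C(0xFF00), UINT32_C(0xF0F0)) == UINT32_C(0xF00F));
}
#endif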
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = ~a_.i32; #else @@ -1026,6 +1160,8 @@ simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask) { r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, mask_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); #else @@ -1132,6 +1268,10 @@ simde_x_mm_abs_ps(simde__m128 a) { r_.altivec_f32 = vec_abs(a_.altivec_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vreplgr2vr_w(0); + __m128 temp1 = __lsx_vfsub_s((__m128)temp, a_.lsx_f32); + r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, temp1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -1160,6 +1300,8 @@ simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.f32 == b_.f32); #else @@ -1191,12 +1333,15 @@ simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1221,6 +1366,8 @@ simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpge(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); #else @@ -1252,12 +1399,15 @@ simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1282,6 +1432,8 @@ simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); #else @@ -1313,12 +1465,15 @@ simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1343,6 +1498,8 @@ simde_mm_cmple_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmple(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); #else @@ -1374,11 +1531,15 @@ simde_mm_cmple_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif @@ -1404,6 +1565,8 @@ simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmplt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); #else @@ -1435,11 +1598,15 @@ simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif @@ -1466,6 +1633,8 @@ simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_nor(r_.altivec_f32, r_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); #else @@ -1497,12 +1666,15 @@ simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1607,6 +1779,9 @@ simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); + r_.lsx_i64 = __lsx_vnor_v(r_.lsx_i64, r_.lsx_i64); #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -1649,6 +1824,8 @@ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -1679,8 +1856,9 @@ simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) { r_, a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - - #if defined(simde_math_isnanf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vor_v(__lsx_vfcmp_cune_s(a_.lsx_f32, a_.lsx_f32), __lsx_vfcmp_cune_s(b_.lsx_f32, b_.lsx_f32)), 0); + #elif defined(simde_math_isnanf) r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? 
~UINT32_C(0) : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -1713,6 +1891,10 @@ simde_mm_comieq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] == b_.f32[0]; #endif @@ -1738,6 +1920,10 @@ simde_mm_comige_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); #else return a_.f32[0] >= b_.f32[0]; #endif @@ -1763,6 +1949,10 @@ simde_mm_comigt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); #else return a_.f32[0] > b_.f32[0]; #endif @@ -1788,6 +1978,10 @@ simde_mm_comile_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] <= b_.f32[0]; #endif @@ -1813,6 +2007,10 @@ simde_mm_comilt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] < b_.f32[0]; #endif @@ -1838,6 +2036,10 @@ simde_mm_comineq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] != b_.f32[0]; #endif @@ -1870,6 +2072,9 
@@ simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sign_pos = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_splats(-0.0f)); r_.altivec_f32 = vec_sel(dest_.altivec_f32, src_.altivec_f32, sign_pos); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v4f32 sign_pos = {-0.0f, -0.0f, -0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vbitsel_v(dest_.lsx_i64, src_.lsx_i64, (v2i64)sign_pos); #elif defined(SIMDE_IEEE754_STORAGE) (void) src_; (void) dest_; @@ -1964,6 +2169,9 @@ simde_mm_cvt_si2ss (simde__m128 a, int32_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + float b_temp = (float)b; + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, __lsx_vldrepl_w(&b_temp, 0), 0); #else r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); r_.i32[1] = a_.i32[1]; @@ -1985,6 +2193,8 @@ simde_mm_cvt_ss2si (simde__m128 a) { return _mm_cvt_ss2si(a); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399) return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + return __lsx_vpickve2gr_w(__lsx_vftintrne_w_s(simde__m128_to_lsx_f32(a)), 0); #else simde__m128_private a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); #if !defined(SIMDE_FAST_CONVERSION_RANGE) @@ -2011,6 +2221,8 @@ simde_mm_cvtpi16_ps (simde__m64 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vffint_s_w(__lsx_vsllwil_w_h(__lsx_vldrepl_d(&a_.i16, 0), 0)); #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16); #else @@ -2269,6 +2481,9 @@ simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float32 b_temp = (simde_float32)b; + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vldrepl_w(&(b_temp), 0), 0); #else r_ = a_; r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); @@ -2296,6 +2511,9 @@ simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float32 b_temp = (simde_float32)b; + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vldrepl_w(&(b_temp), 0), 0); #else r_ = a_; r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); @@ -2317,6 +2535,10 @@ simde_mm_cvtss_f32 (simde__m128 a) { simde__m128_private a_ = simde__m128_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vgetq_lane_f32(a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float32 temp; + __lsx_vstelm_w(a_.lsx_f32, &temp, 0, 0); + return temp; #else return a_.f32[0]; #endif @@ -2348,6 +2570,8 @@ simde_mm_cvtss_si64 (simde__m128 a) { simde__m128_private a_ = simde__m128_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_d(__lsx_vftintrne_l_d(__lsx_vfcvtl_d_s(a_.lsx_f32)), 0); #else return 
SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0])); #endif @@ -2400,6 +2624,8 @@ simde_mm_cvtt_ss2si (simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + return __lsx_vpickve2gr_w(__lsx_vftintrz_w_s(a_.lsx_f32), 0); #else simde_float32 v = a_.f32[0]; #if !defined(SIMDE_FAST_CONVERSION_RANGE) @@ -2431,6 +2657,8 @@ simde_mm_cvttss_si64 (simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return SIMDE_CONVERT_FTOI(int64_t, __lsx_vpickve2gr_w(__lsx_vftintrz_w_s(a_.lsx_f32), 0)); #else return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]); #endif @@ -2454,7 +2682,11 @@ simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) { r_, a_ = simde__m128_to_private(a); - #if defined(simde_math_isnanf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128_private b_ = simde__m128_to_private(b); + __m128i temp = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(temp, temp), 0); + #elif defined(simde_math_isnanf) r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : ~UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -2492,6 +2724,8 @@ simde_mm_div_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_f32 = __lsx_vfdiv_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f32 = a_.f32 / b_.f32; #else @@ -2527,6 +2761,8 @@ simde_mm_div_ss (simde__m128 a, simde__m128 b) { float32_t value = vgetq_lane_f32(simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfdiv_s(a_.lsx_f32, b_.lsx_f32), 0); #else r_.f32[0] = a_.f32[0] / b_.f32[0]; SIMDE_VECTORIZE @@ -2596,6 +2832,10 @@ simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { r_.altivec_f32 = vec_vsx_ld(0, mem_addr); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_ld(0, mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load(mem_addr); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128), sizeof(r_)); #endif @@ -2617,6 +2857,10 @@ simde_mm_load1_ps (simde_float32 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vld1q_dup_f32(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vldrepl_w(mem_addr, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load32_splat(mem_addr); #else r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr)); #endif @@ -2640,6 +2884,10 @@ simde_mm_load_ss (simde_float32 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_w(mem_addr, 0), 12); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = 
wasm_v128_load32_zero(mem_addr); #else r_.f32[0] = *mem_addr; r_.i32[1] = 0; @@ -2657,7 +2905,7 @@ simde_mm_load_ss (simde_float32 const* mem_addr) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); #else simde__m128_private @@ -2666,6 +2914,10 @@ simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(__lsx_vldrepl_d(mem_addr, 0), a_.lsx_i64, 0); #else simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr); r_.f32[0] = a_.f32[0]; @@ -2707,6 +2959,10 @@ simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vcombine_f32(vld1_f32( HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vldrepl_d(mem_addr, 0), 0); #else simde__m64_private b_; simde_memcpy(&b_, mem_addr, sizeof(b_)); @@ -2742,6 +2998,8 @@ simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) r_.altivec_f32 = vec_reve(v_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vshuf4i_w(v_.lsx_i64, 0x1b); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0); #else @@ -2772,6 +3030,8 @@ simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { r_.wasm_v128 = wasm_v128_load(mem_addr); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) r_.altivec_f32 = vec_vsx_ld(0, mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -2857,6 +3117,8 @@ simde_mm_max_ps (simde__m128 a, simde__m128 b) { r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -2918,6 +3180,8 @@ simde_mm_max_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32_t value = vgetq_lane_f32(maxq_f32(a_.neon_f32, b_.neon_f32), 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32), 0); #else r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? 
a_.f32[0] : b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -2982,6 +3246,8 @@ simde_mm_min_ps (simde__m128 a, simde__m128 b) { #else r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32)); #endif + #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < b_.f32); r_.f32 = @@ -3052,6 +3318,8 @@ simde_mm_min_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32_t value = vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32), 0); #else r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -3077,13 +3345,17 @@ simde_mm_movehl_ps (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_u64 = vzip2q_u64(b_.neon_u64, a_.neon_u64); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x2_t a32 = vget_high_f32(a_.neon_f32); float32x2_t b32 = vget_high_f32(b_.neon_f32); r_.neon_f32 = vcombine_f32(b32, a32); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_mergel(b_.altivec_i64, a_.altivec_i64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3); #else @@ -3111,15 +3383,17 @@ simde_mm_movelh_ps (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x2_t a10 = vget_low_f32(a_.neon_f32); float32x2_t b10 = vget_low_f32(b_.neon_f32); r_.neon_f32 = vcombine_f32(a10, b10); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_mergeh(a_.altivec_i64, b_.altivec_i64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); #else r_.f32[0] = a_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -3171,34 +3445,23 @@ simde_mm_movemask_pi8 (simde__m64 a) { SIMDE_FUNCTION_ATTRIBUTES int simde_mm_movemask_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) return _mm_movemask_ps(a); #else int r = 0; simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + static const int32_t shift[4] = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31); + return HEDLEY_STATIC_CAST(int32_t, vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)))); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) // Shift out everything but the sign bits with a 32-bit unsigned shift right. 
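/* Editorial aside (illustrative sketch, not part of the upstream patch): every
 * branch of simde_mm_movemask_ps below computes the same result -- bit i of
 * the return value is the sign bit of 32-bit lane i. A portable scalar
 * reference, assuming IEEE-754 binary32 lanes; the helper name is hypothetical
 * and the block is kept behind #if 0 so it cannot interfere with the diff
 * context. */
#if 0 /* illustrative only */
#include <stdint.h>
#include <string.h>

static inline int example_movemask_ps(const float lanes[4]) {
  int r = 0;
  for (int i = 0; i < 4; i++) {
    uint32_t bits;
    memcpy(&bits, &lanes[i], sizeof(bits)); /* type-pun without UB */
    r |= (int)(bits >> 31) << i;            /* collect the sign bit into bit i */
  }
  return r;
}
#endif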
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31)); // Merge the two pairs together with a 64-bit unsigned shift right + add. uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); // Extract the result. return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const uint32_t md[4] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3 - }; - - uint32x4_t extended = vreinterpretq_u32_s32(vshrq_n_s32(a_.neon_i32, 31)); - uint32x4_t masked = vandq_u32(vld1q_u32(md), extended); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return HEDLEY_STATIC_CAST(int32_t, vaddvq_u32(masked)); - #else - uint64x2_t t64 = vpaddlq_u32(masked); - return - HEDLEY_STATIC_CAST(int, vgetq_lane_u64(t64, 0)) + - HEDLEY_STATIC_CAST(int, vgetq_lane_u64(t64, 1)); - #endif #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && defined(SIMDE_BUG_CLANG_50932) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a_.altivec_u64), idx)); @@ -3207,6 +3470,11 @@ simde_mm_movemask_ps (simde__m128 a) { SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); return HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v2i64 t64 = __lsx_vmskltz_w(a_.lsx_i64); + r = __lsx_vpickve2gr_wu(t64, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_bitmask(a_.wasm_v128)); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) { @@ -3240,6 +3508,8 @@ simde_mm_mul_ps (simde__m128 a, simde__m128 b) { r_.f32 = a_.f32 * b_.f32; #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -3269,10 +3539,14 @@ simde_mm_mul_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.f32[0] = a_.f32[0] * b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.f32[0] = a_.f32[0] * b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + #endif return simde__m128_from_private(r_); #endif @@ -3471,6 +3745,9 @@ simde_mm_prefetch (const void* p, int i) { __prefetch_by_load(p, 0, 1); break; } + #elif HEDLEY_MSVC_VERSION + (void) i; + (void) p; #endif } #if defined(SIMDE_X86_SSE_NATIVE) @@ -3506,6 +3783,9 @@ simde_x_mm_negate_ps(simde__m128 a) { r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) r_.altivec_f32 = vec_neg(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v4f32 f32 = {0.0f, 0.0f, 0.0f, 0.0f}; + r_.lsx_f32 = __lsx_vfsub_s(f32, a_.lsx_f32); #elif defined(SIMDE_VECTOR_NEGATE) r_.f32 = -a_.f32; #else @@ -3543,6 
+3823,8 @@ simde_mm_rcp_ps (simde__m128 a) { r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_re(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrecip_s(a_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.f32 = 1.0f / a_.f32; #elif defined(SIMDE_IEEE754_STORAGE) @@ -3584,11 +3866,14 @@ simde_mm_rcp_ss (simde__m128 a) { simde__m128_private r_, a_ = simde__m128_to_private(a); - - r_.f32[0] = 1.0f / a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfrecip_s(a_.lsx_f32), 0); + #else + r_.f32[0] = 1.0f / a_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + #endif return simde__m128_from_private(r_); #endif @@ -3611,6 +3896,10 @@ simde_mm_rsqrt_ps (simde__m128 a) { r_.neon_f32 = vrsqrteq_f32(a_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_rsqrte(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrsqrt_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), wasm_f32x4_sqrt(a_.wasm_v128)); #elif defined(SIMDE_IEEE754_STORAGE) /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf Pages 100 - 103 */ @@ -3673,6 +3962,8 @@ simde_mm_rsqrt_ss (simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(simde_mm_rsqrt_ps(a).neon_f32, 0), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfrsqrt_s(a_.lsx_f32), 0); #elif defined(SIMDE_IEEE754_STORAGE) { #if SIMDE_ACCURACY_PREFERENCE <= 0 @@ -3793,6 +4084,10 @@ simde_mm_setzero_ps (void) { return vdupq_n_f32(SIMDE_FLOAT32_C(0.0)); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_splats(SIMDE_FLOAT32_C(0.0)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_const(0.f, 0.f, 0.f, 0.f); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)__lsx_vreplgr2vr_w(0); #else simde__m128 r; simde_memset(&r, 0, sizeof(r)); @@ -3851,7 +4146,7 @@ simde_mm_sfence (void) { atomic_thread_fence(memory_order_seq_cst); #endif #elif defined(_MSC_VER) - MemoryBarrier(); + simde_MemoryBarrier(); #elif HEDLEY_HAS_EXTENSION(c_atomic) __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) @@ -3874,11 +4169,11 @@ simde_mm_sfence (void) { # define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \ - const simde__m64_private simde__tmp_a_ = simde__m64_to_private(a); \ + const simde__m64_private simde_tmp_a_ = simde__m64_to_private(a); \ simde__m64_from_private((simde__m64_private) { .i16 = \ SIMDE_SHUFFLE_VECTOR_(16, 8, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ (((imm8) ) & 3), \ (((imm8) >> 2) & 3), \ (((imm8) >> 4) & 3), \ @@ -3931,21 +4226,23 @@ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) } #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) # define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shuffle_ps(a, b, 
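Several of the `*_ss` scalar helpers (min, mul, rcp, rsqrt, and friends) gain the same LSX shape in this patch: perform the operation across the whole vector, then use `__lsx_vextrins_w` to copy only lane 0 of that result back into `a`, so lanes 1..3 pass through unchanged. A rough scalar model of the pattern, assuming a plain four-float struct and a hypothetical `model_op_ss` helper:

    typedef struct { float f32[4]; } vec4f;

    /* Lanes 1..3 come straight from a; only lane 0 is recomputed,
       mirroring __lsx_vextrins_w(a, op(a, b), 0). */
    static vec4f model_op_ss(vec4f a, vec4f b, float (*op)(float, float)) {
      vec4f r = a;
      r.f32[0] = op(a.f32[0], b.f32[0]);
      return r;
    }

The `_sd` double-precision variants later in the patch follow the same idea with `__lsx_vextrins_d` on the low 64-bit lane.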
imm8) (simde__m128)(__lsx_vpermi_w(simde__m128_to_private(b).lsx_i64, simde__m128_to_private(a).i64, imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ - simde__m128_from_private((simde__m128_private) { .f32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - simde__m128_to_private(a).f32, \ - simde__m128_to_private(b).f32, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) + simde__m128_from_private((simde__m128_private) { .wasm_v128 = \ + wasm_i32x4_shuffle( \ + simde__m128_to_private(a).wasm_v128, \ + simde__m128_to_private(b).wasm_v128, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shuffle_ps(a, b, imm8) \ (__extension__({ \ - float32x4_t simde_mm_shuffle_ps_a_ = simde__m128i_to_neon_f32(a); \ - float32x4_t simde_mm_shuffle_ps_b_ = simde__m128i_to_neon_f32(b); \ + float32x4_t simde_mm_shuffle_ps_a_ = simde__m128_to_neon_f32(a); \ + float32x4_t simde_mm_shuffle_ps_b_ = simde__m128_to_neon_f32(b); \ float32x4_t simde_mm_shuffle_ps_r_; \ \ simde_mm_shuffle_ps_r_ = vmovq_n_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, (imm8) & (0x3))); \ @@ -3953,6 +4250,16 @@ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_ps_r_, 2); \ vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_ps_r_, 3); \ })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ + simde__m128_from_private((simde__m128_private) { .f32 = \ + SIMDE_SHUFFLE_VECTOR_(32, 16, \ + simde__m128_to_private(a).f32, \ + simde__m128_to_private(b).f32, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) #endif #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) # define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8) @@ -3980,6 +4287,8 @@ simde_mm_sqrt_ps (simde__m128 a) { r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = vec_sqrt(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfsqrt_s(a_.lsx_f32); #elif defined(simde_math_sqrt) SIMDE_VECTORIZE for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) { @@ -4014,6 +4323,8 @@ simde_mm_sqrt_ss (simde__m128 a) { float32_t value = vgetq_lane_f32(simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfsqrt_s(a_.lsx_f32), 0); #elif defined(simde_math_sqrtf) r_.f32[0] = simde_math_sqrtf(a_.f32[0]); r_.f32[1] = a_.f32[1]; @@ -4044,6 +4355,8 @@ simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) { vec_st(a_.altivec_f32, 0, mem_addr); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(mem_addr, a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(a_.lsx_f32, mem_addr, 0); #else simde_memcpy(mem_addr, &a_, sizeof(a)); #endif @@ -4069,6 +4382,8 @@ simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) { wasm_v128_store(mem_addr_, wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0, 0)); #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(__lsx_vreplvei_w(a_.lsx_f32, 0), mem_addr_, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) simde__m128_private tmp_; tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); @@ -4097,6 +4412,10 @@ simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_lane_f32(mem_addr, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_w(a_.lsx_f32, mem_addr, 0, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store32_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); #else *mem_addr = a_.f32[0]; #endif @@ -4116,6 +4435,10 @@ simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t*, mem_addr), vget_high_f32(a_.neon_f32)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, HEDLEY_REINTERPRET_CAST(void*, mem_addr), 0, 1); #else simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1])); #endif @@ -4130,6 +4453,12 @@ void simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) { #if defined(SIMDE_X86_SSE_NATIVE) _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + simde__m128_private a_ = simde__m128_to_private(a); + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128_private a_ = simde__m128_to_private(a); + __lsx_vstelm_d(a_.lsx_i64, HEDLEY_REINTERPRET_CAST(void*, mem_addr), 0, 0); #else simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr); simde__m128_private a_ = simde__m128_to_private(a); @@ -4159,6 +4488,8 @@ simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x4_t tmp = vrev64q_f32(a_.neon_f32); vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(__lsx_vshuf4i_w(a_.lsx_f32, 0x1b), mem_addr, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0); simde_mm_store_ps(mem_addr, simde__m128_from_private(a_)); @@ -4186,6 +4517,10 @@ simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) { vst1q_f32(mem_addr, a_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) vec_vsx_st(a_.altivec_f32, 0, mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(a_.lsx_f32, mem_addr, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store(mem_addr, a_.wasm_v128); #else simde_memcpy(mem_addr, &a_, sizeof(a_)); #endif @@ -4212,6 +4547,8 @@ simde_mm_sub_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f32 = a_.f32 - b_.f32; #else @@ -4242,12 +4579,14 @@ simde_mm_sub_ss (simde__m128 a, simde__m128 b) { r_, a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[0] - b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - + #if 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.f32[0] = a_.f32[0] - b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + #endif return simde__m128_from_private(r_); #endif } @@ -4273,6 +4612,10 @@ simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4307,6 +4650,10 @@ simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4341,6 +4688,10 @@ simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4375,6 +4726,10 @@ simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4409,6 +4764,10 @@ simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4443,6 +4802,10 @@ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); + #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4470,11 +4833,6 @@ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { # endif #endif -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { @@ -4493,6 +4851,10 @@ simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { float32x2_t b1 = vget_high_f32(b_.neon_f32); float32x2x2_t result = vzip_f32(a1, b1); r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7); #else @@ -4524,13 +4886,17 @@ simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) { r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x2_t a1 = vget_low_f32(a_.neon_f32); float32x2_t b1 = vget_low_f32(b_.neon_f32); float32x2x2_t result = vzip_f32(a1, b1); r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); #else r_.f32[0] = a_.f32[0]; r_.f32[1] = b_.f32[0]; @@ -4550,16 +4916,19 @@ void simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) || \ + defined(SIMDE_VECTOR_SUBSCRIPT)) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private a_ = simde__m64_to_private(a); + vst1_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), a_.neon_i64); #else simde__m64_private* dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr), a_ = simde__m64_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - dest->i64[0] = vget_lane_s64(a_.neon_i64, 0); - #else - dest->i64[0] = a_.i64[0]; - #endif + dest->i64[0] = a_.i64[0]; #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) @@ -4571,9 +4940,11 @@ void simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) { #if defined(SIMDE_X86_SSE_NATIVE) _mm_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m128_private a_ = simde__m128_to_private(a); - __builtin_nontemporal_store(a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32)*, mem_addr)); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_ASSUME_CAST(__typeof__(a)*, mem_addr)); #else simde_mm_store_ps(mem_addr, a); #endif diff --git a/x86/sse2.h b/x86/sse2.h index d4bd1950e..c3fc42f1b 100644 --- a/x86/sse2.h +++ b/x86/sse2.h @@ -33,6 +33,7 @@ #define SIMDE_X86_SSE2_H #include "sse.h" +#include "../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -52,6 +53,11 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; + #endif SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -70,6 +76,7 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128[1]; SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; #endif + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; SIMDE_ALIGN_TO_16 simde_float32 f32[4]; SIMDE_ALIGN_TO_16 simde_float64 f64[2]; @@ -132,6 +139,17 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v16i8 lsx_i8; + v8i16 lsx_i16; + v4i32 lsx_i32; + v2i64 lsx_i64; + v16u8 lsx_u8; + v8u16 lsx_u16; + v4u32 lsx_u32; + v2u64 lsx_u64; + v4f32 lsx_f32; + v2f64 lsx_f64; #endif } simde__m128i_private; @@ -167,7 +185,7 @@ typedef union { SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) SIMDE_ALIGN_TO_16 __m128d n; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_TO_16 int8x16_t neon_i8; @@ -216,10 +234,21 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v16i8 lsx_i8; + v8i16 lsx_i16; + v4i32 lsx_i32; + v2i64 lsx_i64; + v16u8 lsx_u8; + v8u16 lsx_u16; + v4u32 lsx_u32; + v2u64 lsx_u64; + v4f32 lsx_f32; + v2f64 lsx_f64; #endif } simde__m128d_private; -#if defined(SIMDE_X86_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) typedef __m128i simde__m128i; typedef __m128d simde__m128d; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -241,6 +270,9 @@ typedef union { #else typedef simde__m128d_private simde__m128d; #endif +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + typedef v2i64 simde__m128i; + typedef v2f64 simde__m128d; #elif defined(SIMDE_VECTOR_SUBSCRIPT) typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -249,7 +281,7 @@ typedef union { typedef simde__m128d_private simde__m128d; #endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) typedef simde__m128i __m128i; typedef simde__m128d __m128d; #endif @@ -321,6 +353,17 @@ simde__m128d_to_private(simde__m128d v) { 
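The reworked `simde_mm_stream_pi`/`simde_mm_stream_ps` fallbacks above prefer `__builtin_nontemporal_store` whenever the compiler advertises it, and only then fall back to an ordinary store. A self-contained sketch of that guard, assuming a GCC/clang-style vector type; every `model_` name here is illustrative rather than part of SIMDe:

    #include <stdint.h>

    typedef int32_t model_v4si __attribute__((vector_size(16)));

    #ifdef __has_builtin
    #  if __has_builtin(__builtin_nontemporal_store)
    #    define MODEL_HAVE_NT_STORE 1
    #  endif
    #endif

    /* Store a whole vector, hinting the CPU not to keep the line in cache
       when the builtin is available; otherwise do a plain store. */
    static void model_stream_v4si(model_v4si *dst, model_v4si v) {
    #if defined(MODEL_HAVE_NT_STORE)
      __builtin_nontemporal_store(v, dst);
    #else
      *dst = v;
    #endif
    }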
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) #endif +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v16i8, lsx, i8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v8i16, lsx, i16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v4i32, lsx, i32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v2i64, lsx, i64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v16u8, lsx, u8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v8u16, lsx, u16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v4u32, lsx, u32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v2u64, lsx, u64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v4f32, lsx, f32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v2f64, lsx, f64) #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -368,6 +411,17 @@ simde__m128d_to_private(simde__m128d v) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128); SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128); +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v16i8, lsx, i8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v8i16, lsx, i16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v4i32, lsx, i32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2i64, lsx, i64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v16u8, lsx, u8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v8u16, lsx, u16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v4u32, lsx, u32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2u64, lsx, u64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v4f32, lsx, f32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2f64, lsx, f64) #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ SIMDE_FUNCTION_ATTRIBUTES @@ -383,6 +437,9 @@ simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; r_.neon_f64 = vld1q_f64(data); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.f64[0] = e0; r_.f64[1] = e1; @@ -409,6 +466,8 @@ simde_mm_set1_pd (simde_float64 a) { r_.neon_f64 = vdupq_n_f64(a); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vldrepl_d(&a, 0); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -442,6 +501,11 @@ simde_x_mm_abs_pd(simde__m128d a) { r_.neon_f64 = vabsq_f64(a_.neon_f64); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = vec_abs(a_.altivec_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF); + r_.lsx_i64 = __lsx_vand_v(__lsx_vldrepl_d(&u64_, 0), a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -472,6 +536,8 @@ simde_x_mm_not_pd(simde__m128d a) { r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = 
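The new LSX branch of `simde_x_mm_abs_pd` takes the absolute value by ANDing each 64-bit lane with a replicated `0x7FFFFFFFFFFFFFFF`, i.e. by clearing the IEEE-754 sign bit instead of calling a math routine. The same trick per scalar, with the hypothetical helper `model_fabs_bits`:

    #include <stdint.h>
    #include <string.h>

    /* Clearing the top bit of the encoding yields |x| for any finite,
       infinite, or NaN input. */
    static double model_fabs_bits(double x) {
      uint64_t bits;
      memcpy(&bits, &x, sizeof bits);
      bits &= UINT64_C(0x7FFFFFFFFFFFFFFF);
      memcpy(&x, &bits, sizeof bits);
      return x;
    }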
wasm_v128_not(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f; #else @@ -509,6 +575,8 @@ simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) { r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_u64) #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -537,6 +605,8 @@ simde_mm_add_epi8 (simde__m128i a, simde__m128i b) { r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = a_.i8 + b_.i8; #else @@ -570,6 +640,8 @@ simde_mm_add_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = a_.i16 + b_.i16; #else @@ -603,6 +675,8 @@ simde_mm_add_epi32 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 + b_.i32; #else @@ -634,6 +708,8 @@ simde_mm_add_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -665,12 +741,12 @@ simde_mm_add_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfadd_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f64 = a_.f64 + b_.f64; #else @@ -708,6 +784,8 @@ simde_mm_move_sd (simde__m128d a, simde__m128d b) { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(b_.lsx_i64, a_.lsx_i64, 0b00010001); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1); #else @@ -740,6 +818,10 @@ simde_x_mm_broadcastlow_pd(simde__m128d a) { r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = 
vec_splat(a_.altivec_f64, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_splat(a_.f64[0]); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplvei_d(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); #else @@ -767,10 +849,12 @@ simde_mm_add_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] + b_.f64[0]; - r_.f64[1] = a_.f64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfadd_d(b_.lsx_f64, a_.lsx_f64), 0); + #else + r_.f64[0] = a_.f64[0] + b_.f64[0]; + r_.f64[1] = a_.f64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -819,6 +903,8 @@ simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_b(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -850,6 +936,8 @@ simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -881,6 +969,8 @@ simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -912,6 +1002,8 @@ simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -943,6 +1035,8 @@ simde_mm_and_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f & b_.i32f; #else @@ -974,6 +1068,10 @@ simde_mm_and_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f & b_.i32f; #else @@ -1009,6 +1107,8 @@ simde_mm_andnot_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = vec_andc(b_.altivec_f64, 
a_.altivec_f64); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f & b_.i32f; #else @@ -1040,6 +1140,10 @@ simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f & b_.i32f; #else @@ -1073,6 +1177,8 @@ simde_mm_xor_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -1104,6 +1210,8 @@ simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vavgr_bu(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) uint16_t wa SIMDE_VECTOR(32); uint16_t wb SIMDE_VECTOR(32); @@ -1143,6 +1251,8 @@ simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vavgr_hu(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) uint32_t wa SIMDE_VECTOR(32); uint32_t wb SIMDE_VECTOR(32); @@ -1179,6 +1289,8 @@ simde_mm_setzero_si128 (void) { r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_w(0); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 }; #else @@ -1230,21 +1342,45 @@ simde_mm_bslli_si128 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_bslli_si128(a, imm8) \ + (((imm8)<=0) ? (simde__m128i)(a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsll_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bslli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? 
(vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8))))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_bslli_si128(a, imm8) __extension__ ({ \ + simde__m128i_from_wasm_v128( \ + wasm_i8x16_shuffle(wasm_i32x4_splat(INT32_C(0)), \ + simde__m128i_to_wasm_v128((a)), \ + ((imm8)&0xF0) ? 0 : 16 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 17 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 18 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 19 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 20 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 21 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 22 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 23 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 24 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 25 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 26 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 27 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 28 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 29 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 30 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 31 - ((imm8)&0xF))); }) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde__tmp_r_; \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde_tmp_r_; \ if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ } else { \ - simde__tmp_r_.i8 = \ + simde_tmp_r_.i8 = \ SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde__tmp_z_.i8, \ - (simde__tmp_a_).i8, \ + simde_tmp_z_.i8, \ + (simde_tmp_a_).i8, \ HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \ HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \ HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \ @@ -1262,7 +1398,7 @@ simde_mm_bslli_si128 (simde__m128i a, const int imm8) HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \ HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \ } \ - simde__m128i_from_private(simde__tmp_r_); })) + simde__m128i_from_private(simde_tmp_r_); })) #endif #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -1304,21 +1440,54 @@ simde_mm_bsrli_si128 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_bsrli_si128(a, imm8) \ + (((imm8)<=0) ? (simde__m128i)(a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsrl_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bsrli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? 
imm8 : (imm8 & 15)))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ + if (HEDLEY_UNLIKELY(imm8 > 15)) { \ + simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + } else { \ + simde_tmp_r_.wasm_v128 = \ + wasm_i8x16_shuffle( \ + simde_tmp_z_.wasm_v128, \ + simde_tmp_a_.wasm_v128, \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ + } \ + simde__m128i_from_private(simde_tmp_r_); })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ } else { \ - simde__tmp_r_.i8 = \ + simde_tmp_r_.i8 = \ SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde__tmp_z_.i8, \ - (simde__tmp_a_).i8, \ + simde_tmp_z_.i8, \ + (simde_tmp_a_).i8, \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ @@ -1336,7 +1505,7 @@ simde_mm_bsrli_si128 (simde__m128i a, const int imm8) HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ } \ - simde__m128i_from_private(simde__tmp_r_); })) + simde__m128i_from_private(simde_tmp_r_); })) #endif #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -1354,7 +1523,7 @@ simde_mm_clflush (void const* p) { #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_clflush(a, b) simde_mm_clflush() + #define _mm_clflush(p) simde_mm_clflush(p) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1370,6 +1539,8 @@ simde_mm_comieq_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif 
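Both `simde_mm_bslli_si128` and `simde_mm_bsrli_si128` pick up LSX and Wasm definitions above; all of them implement the same whole-register byte shift with zero fill, and any `imm8` above 15 clears the register. A scalar reference for the left shift, using the hypothetical `model_bslli_si128` (the right shift mirrors it with `memcpy(r, a + imm8, 16 - imm8)`):

    #include <stdint.h>
    #include <string.h>

    /* Byte i of a moves to byte i + imm8; the low imm8 bytes become zero. */
    static void model_bslli_si128(uint8_t r[16], const uint8_t a[16], int imm8) {
      memset(r, 0, 16);
      if (imm8 >= 0 && imm8 <= 15)
        memcpy(r + imm8, a, (size_t) (16 - imm8));
    }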
defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_ceq_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] == b_.f64[0]; #endif @@ -1392,6 +1563,8 @@ simde_mm_comige_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] >= b_.f64[0]; #endif @@ -1414,6 +1587,8 @@ simde_mm_comigt_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] > b_.f64[0]; #endif @@ -1436,6 +1611,8 @@ simde_mm_comile_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); #else return a_.f64[0] <= b_.f64[0]; #endif @@ -1458,6 +1635,8 @@ simde_mm_comilt_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); #else return a_.f64[0] < b_.f64[0]; #endif @@ -1480,6 +1659,8 @@ simde_mm_comineq_sd (simde__m128d a, simde__m128d b) { return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !__lsx_vpickve2gr_d(__lsx_vfcmp_ceq_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] != b_.f64[0]; #endif @@ -1513,6 +1694,9 @@ simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) { #else r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64); #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v2f64 sign_pos = {-0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vbitsel_v(dest_.lsx_i64, src_.lsx_i64, (v2i64)sign_pos); #elif defined(simde_math_copysign) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -1539,6 +1723,8 @@ simde_mm_castpd_ps (simde__m128d a) { return _mm_castpd_ps(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f32_f64(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)a; #else simde__m128 r; simde_memcpy(&r, &a, sizeof(a)); @@ -1556,6 +1742,8 @@ simde_mm_castpd_si128 (simde__m128d a) { return _mm_castpd_si128(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_s64_f64(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128i)a; #else simde__m128i r; simde_memcpy(&r, &a, sizeof(a)); @@ -1573,6 +1761,8 @@ simde_mm_castps_pd (simde__m128 a) { return _mm_castps_pd(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f64_f32(a); + #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)a; #else simde__m128d r; simde_memcpy(&r, &a, sizeof(a)); @@ -1590,6 +1780,8 @@ simde_mm_castps_si128 (simde__m128 a) { return _mm_castps_si128(a); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128i)a; #else simde__m128i r; simde_memcpy(&r, &a, sizeof(a)); @@ -1607,6 +1799,8 @@ simde_mm_castsi128_pd (simde__m128i a) { return _mm_castsi128_pd(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f64_s64(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)a; #else simde__m128d r; simde_memcpy(&r, &a, sizeof(a)); @@ -1626,6 +1820,8 @@ simde_mm_castsi128_ps (simde__m128i a) { return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return HEDLEY_REINTERPRET_CAST(__m128, a); #else simde__m128 r; simde_memcpy(&r, &a, sizeof(a)); @@ -1653,6 +1849,8 @@ simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); #else @@ -1686,6 +1884,8 @@ simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = (a_.i16 == b_.i16); #else @@ -1719,6 +1919,8 @@ simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 == b_.i32); #else @@ -1754,6 +1956,8 @@ simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_MIPS_MSA_NATIVE) r_.msa_i32 = __msa_addv_w(a_.msa_i32, b_.msa_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); #else @@ -1785,9 +1989,12 @@ simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? 
~UINT64_C(0) : 0; - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0; + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -1810,6 +2017,8 @@ simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) { r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); #else @@ -1840,11 +2049,12 @@ simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -1869,6 +2079,8 @@ simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) { r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); #else @@ -1902,6 +2114,8 @@ simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); #else @@ -1935,6 +2149,8 @@ simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); #else @@ -1968,6 +2184,8 @@ simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); #else @@ -1999,9 +2217,12 @@ simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = 
simde__m128d_to_private(b); - r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2028,6 +2249,8 @@ simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2056,10 +2279,12 @@ simde_mm_cmple_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2084,6 +2309,8 @@ simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_b(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8)); #else @@ -2117,6 +2344,8 @@ simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16)); #else @@ -2150,6 +2379,8 @@ simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32)); #else @@ -2185,6 +2416,8 @@ simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); + #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2213,10 +2446,12 @@ simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2243,6 +2478,8 @@ simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2272,9 +2509,12 @@ simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2404,6 +2644,12 @@ simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); r_.neon_u64 = vandq_u64(ceqaa, ceqbb); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_and(wasm_f64x2_eq(a_.wasm_v128, a_.wasm_v128), + wasm_f64x2_eq(b_.wasm_v128, b_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(__lsx_vfcmp_ceq_d(a_.lsx_f64, a_.lsx_f64), + __lsx_vfcmp_ceq_d(b_.lsx_f64, b_.lsx_f64)); #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2429,6 +2675,11 @@ simde_mm_cvtsd_f64 (simde__m128d a) { simde__m128d_private a_ = simde__m128d_to_private(a); #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return HEDLEY_STATIC_CAST(simde_float64, wasm_f64x2_extract_lane(a_.wasm_v128, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, &a_.f64, 0, 0); + return a_.f64[0]; #else return a_.f64[0]; #endif @@ -2453,7 +2704,10 @@ simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_isnan) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vand_v(__lsx_vfcmp_ceq_d(a_.lsx_f64, + a_.lsx_f64), __lsx_vfcmp_ceq_d(b_.lsx_f64, b_.lsx_f64)), 0); + #elif defined(simde_math_isnan) r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? 
~UINT64_C(0) : UINT64_C(0); r_.u64[1] = a_.u64[1]; #else @@ -2482,6 +2736,12 @@ simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb)))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_or(wasm_f64x2_ne(a_.wasm_v128, a_.wasm_v128), + wasm_f64x2_ne(b_.wasm_v128, b_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(__lsx_vfcmp_cune_d(a_.lsx_f64, a_.lsx_f64), + __lsx_vfcmp_cune_d(b_.lsx_f64, b_.lsx_f64)); #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2513,7 +2773,9 @@ simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_isnan) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vor_v(__lsx_vfcmp_cune_d(a_.lsx_f64, a_.lsx_f64), __lsx_vfcmp_cune_d(b_.lsx_f64, b_.lsx_f64)), 0); + #elif defined(simde_math_isnan) r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0); r_.u64[1] = a_.u64[1]; #else @@ -2536,7 +2798,11 @@ simde_mm_cvtepi32_pd (simde__m128i a) { simde__m128d_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_convert_low_i32x4(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vffintl_d_w(a_.lsx_i64); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32); #else SIMDE_VECTORIZE @@ -2572,6 +2838,8 @@ simde_mm_cvtepi32_ps (simde__m128i a) { #endif r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0); HEDLEY_DIAGNOSTIC_POP + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vffint_s_w(a_.lsx_i64); #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); #else @@ -2623,9 +2891,13 @@ simde_mm_cvtpd_epi32 (simde__m128d a) { #else simde__m128i_private r_; - r_.m64[0] = simde_mm_cvtpd_pi32(a); - r_.m64[1] = simde_mm_setzero_si64(); - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + const v2f64 zero_f64 = {-0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vftintrne_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64); + #else + r_.m64[0] = simde_mm_cvtpd_pi32(a); + r_.m64[1] = simde_mm_setzero_si64(); + #endif return simde__m128i_from_private(r_); #endif } @@ -2648,6 +2920,9 @@ simde_mm_cvtpd_ps (simde__m128d a) { r_.altivec_f32 = vec_float2(a_.altivec_f64, vec_splats(0.0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_demote_f64x2_zero(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v2f64 zero_f64 = {-0.0f, -0.0f}; + r_.lsx_f32 = __lsx_vfcvt_s_d(zero_f64, a_.lsx_f64); #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && HEDLEY_HAS_BUILTIN(__builtin_convertvector) float __attribute__((__vector_size__(8))) z = { 0.0f, 0.0f }; r_.f32 = @@ -2716,6 +2991,9 @@ simde_mm_cvtps_epi32 (simde__m128 a) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) a_ = simde__m128_to_private(a); r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) + a_ = 
simde__m128_to_private(a); + r_.lsx_i32 = __lsx_vftintrne_w_s(a_.lsx_f32); #else a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); SIMDE_VECTORIZE @@ -2746,10 +3024,14 @@ simde_mm_cvtps_pd (simde__m128 a) { simde__m128d_private r_; simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_promote_low_f32x4(a_.wasm_v128); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfcvtl_d_s(a_.lsx_f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2769,6 +3051,9 @@ int32_t simde_mm_cvtsd_si32 (simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_cvtsd_si32(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + simde__m128d_private a_ = simde__m128d_to_private(a); + return __lsx_vpickve2gr_w(__lsx_vftintrne_w_d(a_.lsx_f64, a_.lsx_f64), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); @@ -2796,7 +3081,11 @@ simde_mm_cvtsd_si64 (simde__m128d a) { #endif #else simde__m128d_private a_ = simde__m128d_to_private(a); - return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0])); + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_d(__lsx_vftintrne_l_d(a_.lsx_f64), 0); + #else + return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0])); + #endif #endif } #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) @@ -2818,6 +3107,8 @@ simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcvt_s_d(b_.lsx_f64, b_.lsx_f64), 0); #else r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]); @@ -2848,6 +3139,8 @@ simde_x_mm_cvtsi128_si16 (simde__m128i a) { (void) a_; #endif return vec_extract(a_.altivec_i16, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_h(a_.lsx_i64, 0); #else return a_.i16[0]; #endif @@ -2871,6 +3164,8 @@ simde_mm_cvtsi128_si32 (simde__m128i a) { (void) a_; #endif return vec_extract(a_.altivec_i32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_w(a_.lsx_i64, 0); #else return a_.i32[0]; #endif @@ -2895,6 +3190,8 @@ simde_mm_cvtsi128_si64 (simde__m128i a) { return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vgetq_lane_s64(a_.neon_i64, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_d(a_.lsx_i64, 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0)); #endif @@ -2918,6 +3215,9 @@ simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float64 b_float64 = (simde_float64)b; + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vldrepl_d(&(b_float64), 0), 0); #else r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); r_.i64[1] = a_.i64[1]; @@ -2939,6 +3239,8 @@ simde_x_mm_cvtsi16_si128 (int16_t a) { 
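/*
 * Illustrative sketch (editorial, not SIMDe code): the cvtsi32_sd / cvtsd_ss
 * hunks above all follow the same "convert into lane 0, carry the remaining
 * lanes over unchanged" shape that the LSX paths express with __lsx_vextrins_*
 * merges. A portable reference of the _mm_cvtsi32_sd semantics; the ref_*
 * names are hypothetical.
 */
#include <stdint.h>

typedef struct { double f64[2]; } ref_m128d;

static ref_m128d ref_cvtsi32_sd(ref_m128d a, int32_t b) {
  ref_m128d r = a;         /* lane 1 is passed through from a */
  r.f64[0] = (double) b;   /* lane 0 receives the converted integer */
  return r;
}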
r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_h(__lsx_vreplgr2vr_h(0), a, 0); #else r_.i16[0] = a; r_.i16[1] = 0; @@ -2965,6 +3267,8 @@ simde_mm_cvtsi32_si128 (int32_t a) { r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(0), a, 0); #else r_.i32[0] = a; r_.i32[1] = 0; @@ -2995,6 +3299,9 @@ simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float64 b_float64 = (simde_float64)b; + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vldrepl_d(&(b_float64), 0), 0); #else r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); r_.f64[1] = a_.f64[1]; @@ -3025,6 +3332,8 @@ simde_mm_cvtsi64_si128 (int64_t a) { r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_make(a, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0), a, 0); #else r_.i64[0] = a; r_.i64[1] = 0; @@ -3052,8 +3361,11 @@ simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) { a_ = simde__m128d_to_private(a); simde__m128_private b_ = simde__m128_to_private(b); - a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcvtl_d_s(b_.lsx_f32), 0); + #else + a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); + #endif return simde__m128d_from_private(a_); #endif } @@ -3099,9 +3411,13 @@ simde_mm_cvttpd_epi32 (simde__m128d a) { #else simde__m128i_private r_; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + const v2f64 zero_f64 = {-0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vftintrz_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64); + #else r_.m64[0] = simde_mm_cvttpd_pi32(a); r_.m64[1] = simde_mm_setzero_si64(); - + #endif return simde__m128i_from_private(r_); #endif } @@ -3156,7 +3472,26 @@ simde_mm_cvttps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); #endif - #elif defined(SIMDE_CONVERT_VECTOR_) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vftintrz_w_s(a_.lsx_f32); + #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) + #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) + simde_float32 f1 = 2147483648.0f; + __m128i valid_input = + __lsx_vand_v( + __lsx_vfcmp_clt_s(a_.lsx_f32, (__m128)__lsx_vldrepl_w(&f1, 0)), + __lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32) + ); + #elif !defined(SIMDE_FAST_CONVERSION_RANGE) + simde_float32 f1 = 2147483648.0f; + __m128i valid_input = __lsx_vfcmp_clt_s(a_.lsx_f32, (__m128)__lsx_vldrepl_w(&f1, 0)); + #elif !defined(SIMDE_FAST_NANS) + __m128i valid_input = __lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32); + #endif + + r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), temp, valid_input); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) @@ -3199,6 +3534,9 @@ int32_t simde_mm_cvttsd_si32
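/*
 * Illustrative sketch (editorial, not SIMDe code): the cvttps_epi32 hunk above
 * emulates the SSE2 behaviour of producing INT32_MIN (the x86 "integer
 * indefinite" value) for NaN or out-of-range lanes by building a validity mask
 * and bit-selecting against a splat of INT32_MIN. The same per-lane rule in
 * portable C; the ref_* name is hypothetical.
 */
#include <stdint.h>
#include <math.h>

static int32_t ref_cvtt_f32_i32(float x) {
  if (isnan(x) || x >= 2147483648.0f || x < -2147483648.0f)
    return INT32_MIN;      /* what CVTTPS2DQ returns for invalid lanes */
  return (int32_t) x;      /* C truncation == round toward zero */
}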
(simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_cvttsd_si32(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + simde__m128d_private a_ = simde__m128d_to_private(a); + return __lsx_vpickve2gr_w(__lsx_vftintrz_w_d(a_.lsx_f64, a_.lsx_f64), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); simde_float64 v = a_.f64[0]; @@ -3223,6 +3561,9 @@ simde_mm_cvttsd_si64 (simde__m128d a) { #else return _mm_cvttsd_si64x(a); #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128d_private a_ = simde__m128d_to_private(a); + return __lsx_vpickve2gr_d(__lsx_vftintrz_l_d(a_.lsx_f64), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]); @@ -3251,6 +3592,8 @@ simde_mm_div_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfdiv_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -3283,6 +3626,9 @@ simde_mm_div_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128d temp = __lsx_vfdiv_d(a_.lsx_f64, b_.lsx_f64); + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)temp, 0); #else r_.f64[0] = a_.f64[0] / b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -3318,6 +3664,10 @@ simde_mm_extract_epi16 (simde__m128i a, const int imm8) #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, wasm_u16x8_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, __lsx_vpickve2gr_hu(simde__m128i_to_private(a).lsx_i64, imm8)) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8) @@ -3335,6 +3685,10 @@ simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8) #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_insert_epi16(a, i, imm8) wasm_i16x8_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7, (i)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_lsx_i16((v8i16)__lsx_vinsgr2vr_h(simde__m128i_to_private(a).lsx_i64, i, imm8)) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8) @@ -3352,6 +3706,10 @@ simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { r_.neon_f64 = vld1q_f64(mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr)); + #elif
defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_)); #endif @@ -3372,6 +3730,8 @@ simde_mm_load1_pd (simde_float64 const* mem_addr) { return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return simde__m128d_from_wasm_v128(wasm_v128_load64_splat(mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)__lsx_vldrepl_d(mem_addr, 0); #else return simde_mm_set1_pd(*mem_addr); #endif @@ -3392,6 +3752,10 @@ simde_mm_load_sd (simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_zero(HEDLEY_REINTERPRET_CAST(const void*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(__lsx_vreplgr2vr_d(0), __lsx_vldrepl_d(mem_addr, 0), 0); #else r_.f64[0] = *mem_addr; r_.u64[1] = UINT64_C(0); @@ -3409,13 +3773,15 @@ simde__m128i simde_mm_load_si128 (simde__m128i const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128i)__lsx_vld(mem_addr, 0); #else simde__m128i_private r_; #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i)); #endif @@ -3427,6 +3793,11 @@ simde_mm_load_si128 (simde__m128i const* mem_addr) { #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { @@ -3439,6 +3810,10 @@ simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(__lsx_vldrepl_d(mem_addr, 0), a_.lsx_i64); #else simde_float64 t; @@ -3454,6 +3829,10 @@ simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { @@ -3462,12 +3841,13 @@ simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { #else simde__m128i_private r_; - int64_t value; - simde_memcpy(&value, mem_addr, sizeof(value)); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0)); + #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_d(__lsx_vldrepl_d(mem_addr, 0), 0, 1); #else + int64_t value; + simde_memcpy(&value, mem_addr, sizeof(value)); r_.i64[0] = value; r_.i64[1] = 0; #endif @@ -3492,6 +3872,10 @@ simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vcombine_f64(vld1_f64( HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(a_.lsx_i64, __lsx_vldrepl_d(mem_addr, 0)); #else r_.f64[0] = *mem_addr; r_.u64[1] = a_.u64[1]; @@ -3522,6 +3906,9 @@ simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) v128_t tmp = wasm_v128_load(mem_addr); r_.wasm_v128 = wasm_i64x2_shuffle(tmp, tmp, 1, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vld(mem_addr, 0); + r_.lsx_i64 = __lsx_vshuf4i_d(temp, temp, 0b0001); #else r_.f64[0] = mem_addr[1]; r_.f64[1] = mem_addr[0]; @@ -3541,6 +3928,8 @@ simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { return _mm_loadu_pd(mem_addr); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld1q_f64(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)__lsx_vld(mem_addr, 0); #else simde__m128d_private r_; @@ -3553,18 +3942,23 @@ simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi8(mem_addr) _mm_loadu_epi8(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi8(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3572,24 +3966,30 @@ simde_mm_loadu_epi8(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif #define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi8 #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi16(mem_addr) _mm_loadu_epi16(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i 
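/*
 * Illustrative sketch (editorial, not SIMDe code): every loadu_* fallback in
 * the hunks above ultimately reduces to a memcpy, which is the portable,
 * strict-aliasing-safe way to express an unaligned 128-bit load when no
 * native unaligned-load intrinsic is selected. Minimal stand-alone version of
 * that idea; the ref_* names are hypothetical.
 */
#include <stdint.h>
#include <string.h>

typedef struct { int8_t i8[16]; } ref_m128i;

static ref_m128i ref_loadu_si128(const void *mem_addr) {
  ref_m128i r;
  memcpy(&r, mem_addr, sizeof(r));  /* no alignment requirement on mem_addr */
  return r;
}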
simde_mm_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi16(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3597,24 +3997,29 @@ simde_mm_loadu_epi16(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif #define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi16 #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi32(mem_addr) _mm_loadu_epi32(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi32(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi32(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr), 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3622,24 +4027,30 @@ simde_mm_loadu_epi32(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif #define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi32 #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi64(mem_addr) _mm_loadu_epi64(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi64(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, 
sizeof(r_)); #endif @@ -3647,6 +4058,7 @@ simde_mm_loadu_epi64(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif #define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi64 @@ -3671,6 +4083,8 @@ simde_mm_loadu_si128 (void const* mem_addr) { HEDLEY_DIAGNOSTIC_POP #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3707,6 +4121,8 @@ simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, vec_splats(0)); #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_mule(a_.altivec_i16, b_.altivec_i16) + vec_mulo(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_dot_i16x8(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int32_t SIMDE_VECTOR(32) a32, b32, p32; SIMDE_CONVERT_VECTOR_(a32, a_.i16); @@ -3715,6 +4131,9 @@ simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { r_.i32 = __builtin_shufflevector(p32, p32, 0, 2, 4, 6) + __builtin_shufflevector(p32, p32, 1, 3, 5, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_w_h(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vmaddwod_w_h(temp_ev, a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { @@ -3739,11 +4158,17 @@ simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HED a_ = simde__m128i_to_private(a), mask_ = simde__m128i_to_private(mask); - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - if (mask_.u8[i] & 0x80) { - mem_addr[i] = a_.i8[i]; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vld(mem_addr, 0); + __m128i temp1 = __lsx_vbitsel_v(temp, a_.lsx_i64, __lsx_vslti_b(mask_.lsx_i64, 0)); + __lsx_vst(temp1, mem_addr, 0); + #else + for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { + if (mask_.u8[i] & 0x80) { + mem_addr[i] = a_.i8[i]; + } } - } + #endif #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -3790,6 +4215,10 @@ simde_mm_movemask_epi8 (simde__m128i a) { #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG) static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = HEDLEY_STATIC_CAST(int32_t, wasm_i8x16_bitmask(a_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = __lsx_vpickve2gr_w(__lsx_vmskltz_b(a_.lsx_i64), 0); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { @@ -3829,6 +4258,10 @@ simde_mm_movemask_pd (simde__m128d a) { SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); r = HEDLEY_STATIC_CAST(int32_t, 
vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_bitmask(a_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = __lsx_vpickve2gr_w(__lsx_vmskltz_d(a_.lsx_i64), 0); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { @@ -3854,6 +4287,8 @@ simde_mm_movepi64_pi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_i64 = vget_low_s64(a_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.i64[0] = __lsx_vpickve2gr_d(a_.lsx_i64, 0); #else r_.i64[0] = a_.i64[0]; #endif @@ -3876,6 +4311,8 @@ simde_mm_movpi64_epi64 (simde__m64 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0), a_.i64[0], 0); #else r_.i64[0] = a_.i64[0]; r_.i64[1] = 0; @@ -3905,6 +4342,8 @@ simde_mm_min_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -3936,6 +4375,8 @@ simde_mm_min_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -3967,6 +4408,8 @@ simde_mm_min_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfmin_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -3999,6 +4442,8 @@ simde_mm_min_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfmin_d(a_.lsx_f64, b_.lsx_f64), 0); #else r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? 
a_.f64[0] : b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -4028,6 +4473,8 @@ simde_mm_max_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4059,6 +4506,8 @@ simde_mm_max_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -4090,6 +4539,8 @@ simde_mm_max_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfmax_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4122,6 +4573,8 @@ simde_mm_max_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfmax_d(a_.lsx_f64, b_.lsx_f64), 0); #else r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? 
a_.f64[0] : b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -4146,6 +4599,10 @@ simde_mm_move_epi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, wasm_i64x2_const(0, 0), 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(__lsx_vreplgr2vr_d(0), a_.lsx_i64); #else r_.i64[0] = a_.i64[0]; r_.i64[1] = 0; @@ -4173,6 +4630,12 @@ simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) { uint32x2_t a_lo = vmovn_u64(a_.neon_u64); uint32x2_t b_lo = vmovn_u64(b_.neon_u64); r_.neon_u64 = vmull_u32(a_lo, b_lo); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u64x2_extmul_low_u32x4( + wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 2, 0, 2), + wasm_i32x4_shuffle(b_.wasm_v128, b_.wasm_v128, 0, 2, 0, 2)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmulwev_d_wu(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(a_.u32) z = { 0, }; a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6); @@ -4201,7 +4664,11 @@ simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) { a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmul_d(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 * b_.i64; #else SIMDE_VECTORIZE @@ -4221,7 +4688,9 @@ simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) { a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmod_d(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) r_.i64 = a_.i64 % b_.i64; #else SIMDE_VECTORIZE @@ -4250,6 +4719,8 @@ simde_mm_mul_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4282,6 +4753,8 @@ simde_mm_mul_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64), 0); #else r_.f64[0] = a_.f64[0] * b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -4343,6 +4816,12 @@ simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) { uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); r_.neon_u16 = rv.val[1]; #endif + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + const v128_t lo = wasm_i32x4_extmul_low_i16x8(a_.wasm_v128, b_.wasm_v128); + const v128_t hi = wasm_i32x4_extmul_high_i16x8(a_.wasm_v128, b_.wasm_v128); + r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmuh_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < 
(sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4382,6 +4861,12 @@ simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) { uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); r_.neon_u16 = neon_r.val[1]; #endif + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + const v128_t lo = wasm_u32x4_extmul_low_u16x8(a_.wasm_v128, b_.wasm_v128); + const v128_t hi = wasm_u32x4_extmul_high_u16x8(a_.wasm_v128, b_.wasm_v128); + r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmuh_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -4413,6 +4898,10 @@ simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) { (void) a_; (void) b_; r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmul_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4444,6 +4933,8 @@ simde_mm_or_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -4473,6 +4964,10 @@ simde_mm_or_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f | b_.i32f; #else @@ -4508,6 +5003,8 @@ simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i8 = vec_packs(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_b_h(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int16_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const int16_t SIMDE_VECTOR(32) min = { INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN }; @@ -4557,6 +5054,8 @@ simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) { r_.sse_m128i = _mm_packs_epi32(a_.sse_m128i, b_.sse_m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_h_w(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int32_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); const int32_t SIMDE_VECTOR(32) min = { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }; @@ -4612,6 +5111,8 @@ 
simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_bu_h(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) int16_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -4639,6 +5140,30 @@ void simde_mm_pause (void) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_pause(); + #elif defined(SIMDE_ARCH_X86) + #if defined(_MSC_VER) + __asm pause; + #else + __asm__ __volatile__("pause"); + #endif + #elif defined(SIMDE_ARCH_ARM_NEON) + #if defined(_MSC_VER) + __isb(SIMDE_ARM64_BARRIER_SY); + #else + __asm__ __volatile__("isb\n"); + #endif + #elif defined(SIMDE_ARCH_POWER) + __asm__ __volatile__ ("or 27,27,27" ::: "memory"); + #elif defined(SIMDE_ARCH_WASM) + __asm__ __volatile__ ("nop"); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __asm__ __volatile ("dbar 0"); + #elif defined(HEDLEY_GCC_VERSION) + #if defined(SIMDE_ARCH_RISCV32) || defined(SIMDE_ARCH_RISCV64) + __builtin_riscv_pause(); + #else + __asm__ __volatile__ ("nop" ::: "memory"); + #endif #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -4661,6 +5186,19 @@ simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) { r_.neon_u64 = vcombine_u64( vpaddl_u32(vpaddl_u16(vget_low_u16(t))), vpaddl_u32(vpaddl_u16(vget_high_u16(t)))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t tmp = wasm_v128_or(wasm_u8x16_sub_sat(a_.wasm_v128, b_.wasm_v128), + wasm_u8x16_sub_sat(b_.wasm_v128, a_.wasm_v128)); + tmp = wasm_i16x8_add(wasm_u16x8_shr(tmp, 8), + wasm_v128_and(tmp, wasm_i16x8_splat(0x00FF))); + tmp = wasm_i16x8_add(tmp, wasm_i32x4_shl(tmp, 16)); + tmp = wasm_i16x8_add(tmp, wasm_i64x2_shl(tmp, 32)); + r_.wasm_v128 = wasm_u64x2_shr(tmp, 48); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vabsd_bu(a_.lsx_i64, b_.lsx_i64); + temp = __lsx_vhaddw_hu_bu(temp, temp); + temp = __lsx_vhaddw_wu_hu(temp, temp); + r_.lsx_i64 = __lsx_vhaddw_du_wu(temp, temp); #else for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { uint16_t tmp = 0; @@ -4705,6 +5243,13 @@ simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, e8, e9, e10, e11, e12, e13, e14, e15}; r_.neon_i8 = vld1q_s8(data); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v16i8) int8_t data[16] = { + e0, e1, e2, e3, + e4, e5, e6, e7, + e8, e9, e10, e11, + e12, e13, e14, e15}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i8[ 0] = e0; r_.i8[ 1] = e1; @@ -4745,6 +5290,9 @@ simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, r_.neon_i16 = vld1q_s16(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v8i16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i16[0] = e0; r_.i16[1] = e1; @@ -4768,8 +5316,11 @@ simde__m128i simde_mm_loadu_si16 (void const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) + HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ + HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si16(mem_addr); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + 
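/*
 * Illustrative sketch (editorial, not SIMDe code): the _mm_sad_epu8 hunk above
 * reduces |a[i] - b[i]| over each 8-byte half with widening horizontal adds
 * (vhaddw_* on LSX, shift-and-add folding on WASM). Portable reference of the
 * operation itself; the ref_* name is hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>

/* r[0] and r[1] hold the sums of absolute differences of bytes 0..7 and
   8..15 respectively; the upper 48 bits of each 64-bit lane are always zero. */
static void ref_sad_epu8(const uint8_t a[16], const uint8_t b[16], uint64_t r[2]) {
  for (int half = 0; half < 2; half++) {
    uint16_t sum = 0;  /* worst case 8 * 255 = 2040, fits in 16 bits */
    for (int i = 0; i < 8; i++) {
      int d = (int) a[half * 8 + i] - (int) b[half * 8 + i];
      sum = (uint16_t) (sum + (uint16_t) abs(d));
    }
    r[half] = sum;
  }
}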
return __lsx_vld(mem_addr, 0); #else int16_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -4793,6 +5344,9 @@ simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { r_.neon_i32 = vld1q_s32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v4i32) int32_t data[4] = {e0, e1, e2, e3}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i32[0] = e0; r_.i32[1] = e1; @@ -4812,8 +5366,17 @@ simde__m128i simde_mm_loadu_si32 (void const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) + HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ + HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si32(mem_addr); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_v128_load32_zero(mem_addr)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m128i_private r_; + r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); + return simde__m128i_from_private(r_); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + return __lsx_vld(mem_addr, 0); #else int32_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -4834,6 +5397,9 @@ simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1)); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_TO_16 simde__m64 data[2] = {e0, e1}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.m64[0] = e0; r_.m64[1] = e1; @@ -4859,6 +5425,9 @@ simde_mm_set_epi64x (int64_t e1, int64_t e0) { r_.neon_i64 = vld1q_s64(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_make(e0, e1); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v2i64) int64_t data[2] = {e0, e1}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i64[0] = e0; r_.i64[1] = e1; @@ -4879,6 +5448,8 @@ simde_mm_loadu_si64 (void const* mem_addr) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) return _mm_loadu_si64(mem_addr); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + return __lsx_vld(mem_addr, 0); #else int64_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -4911,6 +5482,15 @@ simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, e8, e9, e10, e11, e12, e13, e14, e15}; r_.neon_u8 = vld1q_u8(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v16u8) uint8_t data[16] = { + e0, e1, e2, e3, + e4, e5, e6, e7, + e8, e9, e10, e11, + e12, e13, e14, e15}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u8[ 0] = e0; r_.u8[ 1] = e1; r_.u8[ 2] = e2; r_.u8[ 3] = e3; r_.u8[ 4] = e4; r_.u8[ 5] = e5; r_.u8[ 6] = e6; r_.u8[ 7] = e7; @@ -4936,6 +5516,11 @@ simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; r_.neon_u16 = vld1q_u16(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v8u16) uint16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3; 
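/*
 * Illustrative sketch (editorial, not SIMDe code): the set_epi* and
 * x_mm_set_epu* hunks above stage their arguments in a small aligned array
 * before loading because the intrinsics take elements most-significant-first
 * while lane 0 lives at the lowest address. Tiny reference of that ordering
 * for a 4 x 32-bit set; the ref_* names are hypothetical.
 */
#include <stdint.h>

typedef struct { int32_t i32[4]; } ref_m128i_32;

/* _mm_set_epi32(e3, e2, e1, e0): e0 ends up in lane 0 (lowest address). */
static ref_m128i_32 ref_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
  ref_m128i_32 r;
  r.i32[0] = e0; r.i32[1] = e1; r.i32[2] = e2; r.i32[3] = e3;
  return r;
}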
r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7; @@ -4957,6 +5542,11 @@ simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 }; r_.neon_u32 = vld1q_u32(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v4u32) uint32_t data[4] = {e0, e1, e2, e3}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u32[0] = e0; r_.u32[1] = e1; @@ -4979,6 +5569,11 @@ simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1}; r_.neon_u64 = vld1q_u64(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u64x2_make(e0, e1); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v2u64) uint64_t data[2] = {e0, e1}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u64[0] = e0; r_.u64[1] = e1; @@ -4995,6 +5590,10 @@ simde_mm_set_sd (simde_float64 a) { return _mm_set_sd(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); + #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + return (__m128d)__lsx_vinsgr2vr_d(__lsx_vldrepl_d(&a, 0), 0, 1); #else return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); #endif @@ -5017,6 +5616,8 @@ simde_mm_set1_epi8 (int8_t a) { r_.wasm_v128 = wasm_i8x16_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_b(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -5045,6 +5646,8 @@ simde_mm_set1_epi16 (int16_t a) { r_.wasm_v128 = wasm_i16x8_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_h(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -5073,6 +5676,8 @@ simde_mm_set1_epi32 (int32_t a) { r_.wasm_v128 = wasm_i32x4_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_w(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -5101,6 +5706,8 @@ simde_mm_set1_epi64x (int64_t a) { r_.wasm_v128 = wasm_i64x2_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_d(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -5134,6 +5741,8 @@ simde__m128i simde_x_mm_set1_epu8 (uint8_t value) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u8x16_splat(value)); #else return 
simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); #endif @@ -5144,6 +5753,8 @@ simde__m128i simde_x_mm_set1_epu16 (uint16_t value) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u16x8_splat(value)); #else return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); #endif @@ -5154,6 +5765,8 @@ simde__m128i simde_x_mm_set1_epu32 (uint32_t value) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u32x4_splat(value)); #else return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); #endif @@ -5164,6 +5777,8 @@ simde__m128i simde_x_mm_set1_epu64 (uint64_t value) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u64x2_splat(value)); #else return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); #endif @@ -5247,6 +5862,8 @@ simde__m128d simde_mm_setzero_pd (void) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_setzero_pd(); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_const(0.0, 0.0)); #else return simde_mm_castsi128_pd(simde_mm_setzero_si128()); #endif @@ -5324,19 +5941,22 @@ simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) return simde__m128i_from_private(r_); } + #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shuffle_epi32(a, imm8) (__lsx_vshuf4i_w(simde__m128i_to_private(a).lsx_i64, (imm8))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - (simde__tmp_a_).i32, \ - (simde__tmp_a_).i32, \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_wasm_v128( \ + wasm_i32x4_shuffle( \ + (simde_tmp_a_).wasm_v128, \ + (simde_tmp_a_).wasm_v128, \ ((imm8) ) & 3, \ ((imm8) >> 2) & 3, \ ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3) }); })) + ((imm8) >> 6) & 3)); })) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shuffle_epi32(a, imm8) \ (__extension__ ({ \ @@ -5348,6 +5968,17 @@ simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_epi32_r_, 3); \ vreinterpretq_s64_s32(simde_mm_shuffle_epi32_r_); \ })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .i32 = \ + SIMDE_SHUFFLE_VECTOR_(32, 16, \ + (simde_tmp_a_).i32, \ + (simde_tmp_a_).i32, \ + ((imm8) ) & 3, \ + ((imm8) >> 2) & 3, \ + ((imm8) >> 4) & 3, \ + ((imm8) >> 6) & 3) }); })) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8) @@ -5369,6 
+6000,21 @@ simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shuffle_pd(a, b, imm8) \ + ({ \ + simde__m128d res; \ + if ((imm8) & 0x01) { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1001); \ + } else if ((imm8) & 0x02) { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1100); \ + } else if ((imm8) & 0x03) { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1101); \ + } else { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1000); \ + } \ + res; \ + }) #elif defined(SIMDE_SHUFFLE_VECTOR_) #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \ simde__m128d_from_private((simde__m128d_private) { .f64 = \ @@ -5402,18 +6048,9 @@ simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shufflehi_epi16(a, imm8) \ + ((simde__m128i)__lsx_vextrins_d(__lsx_vshuf4i_h(simde__m128i_to_private(a).lsx_i64, imm8), simde__m128i_to_private(a).lsx_i64, 0x00)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shufflehi_epi16(a, imm8) \ (__extension__ ({ \ @@ -5425,6 +6062,30 @@ simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 6) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 7); \ simde__m128i_from_neon_i16(simde_mm_shufflehi_epi16_r_); \ })) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .wasm_v128 = \ + wasm_i16x8_shuffle( \ + (simde_tmp_a_).wasm_v128, \ + (simde_tmp_a_).wasm_v128, \ + 0, 1, 2, 3, \ + (((imm8) ) & 3) + 4, \ + (((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .i16 = \ + SIMDE_SHUFFLE_VECTOR_(16, 16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + 0, 1, 2, 3, \ + (((imm8) ) & 3) + 4, \ + (((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8) @@ -5450,18 +6111,20 @@ simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) } #if 
defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), \ - 4, 5, 6, 7) }); })) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shufflelo_epi16(a, imm8) \ + ((simde__m128i)__lsx_vextrins_d(__lsx_vshuf4i_h(simde__m128i_to_private(a).lsx_i64, imm8), simde__m128i_to_private(a).lsx_i64, 0b00010001)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_shufflelo_epi16(a, imm8) \ + simde__m128i_from_wasm_v128( \ + wasm_i16x8_shuffle( \ + simde__m128i_to_wasm_v128((a)), \ + wasm_i16x8_splat(0), \ + (((imm8) & 0x03) ), \ + (((imm8) & 0x0c) >> 2), \ + (((imm8) & 0x30) >> 4), \ + (((imm8) & 0xc0) >> 6), \ + 4, 5, 6, 7)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shufflelo_epi16(a, imm8) \ (__extension__({ \ @@ -5473,6 +6136,18 @@ simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 6) & 0x3)), simde_mm_shufflelo_epi16_r_, 3); \ simde__m128i_from_neon_i16(simde_mm_shufflelo_epi16_r_); \ })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .i16 = \ + SIMDE_SHUFFLE_VECTOR_(16, 16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3), \ + 4, 5, 6, 7) }); })) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8) @@ -5496,6 +6171,8 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) { r_.u16 = (a_.u16 << count_.u64[0]); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0]))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(count_.u64[0])); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0)); #else @@ -5530,6 +6207,8 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) { r_.u32 = (a_.u32 << count_.u64[0]); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0]))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(count_.u64[0])); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? 
wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0)); #else @@ -5565,6 +6244,8 @@ simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) { r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, s)) : wasm_i64x2_const(0,0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(HEDLEY_STATIC_CAST(int64_t, s))); #else #if !defined(SIMDE_BUG_GCC_94488) SIMDE_VECTORIZE @@ -5597,6 +6278,8 @@ simde_mm_sqrt_pd (simde__m128d a) { r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = vec_sqrt(a_.altivec_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfsqrt_d(a_.lsx_f64); #elif defined(simde_math_sqrt) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -5628,7 +6311,9 @@ simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_sqrt) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfsqrt_d(b_.lsx_f64), 0); + #elif defined(simde_math_sqrt) r_.f64[0] = simde_math_sqrt(b_.f64[0]); r_.f64[1] = a_.f64[1]; #else @@ -5650,13 +6335,21 @@ simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0])); + if (HEDLEY_UNLIKELY(count_.i64[0] > 15)) { + return simde_mm_setzero_si128(); + } + + const int cnt = HEDLEY_STATIC_CAST(int, count_.i64[0]); + + a_ = simde__m128i_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -5679,15 +6372,23 @@ simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 32 : count_.i64[0])); + if (HEDLEY_UNLIKELY(count_.i64[0] > 31)) { + return simde_mm_setzero_si128(); + } + + const int cnt = HEDLEY_STATIC_CAST(int, count_.i64[0]); + + a_ = simde__m128i_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -5713,12 +6414,18 @@ simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) { a_ = simde__m128i_to_private(a), count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 
64 : count_.i64[0])); + if (HEDLEY_UNLIKELY(count_.i64[0] > 63)) { + return simde_mm_setzero_si128(); + } + + const int cnt = HEDLEY_STATIC_CAST(int, count_.i64[0]); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsrl_d(a_.lsx_i64, __lsx_vreplgr2vr_d(cnt)); #else #if !defined(SIMDE_BUG_GCC_94488) SIMDE_VECTORIZE @@ -5750,6 +6457,8 @@ simde_mm_srai_epi16 (simde__m128i a, const int imm8) r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v8i16)a_.lsx_i64 >> cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { @@ -5781,6 +6490,8 @@ simde_mm_srai_epi32 (simde__m128i a, const int imm8) r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v4i32)a_.lsx_i64 >> cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { @@ -5814,6 +6525,8 @@ simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) { r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -5845,6 +6558,8 @@ simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) { r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -5871,7 +6586,9 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v8i16)a_.lsx_i64 << imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); #else const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 
0 : imm8; @@ -5915,7 +6632,9 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v4i32)a_.lsx_i64 << imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i32 = a_.i32 << imm8; #else SIMDE_VECTORIZE @@ -5970,7 +6689,9 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v2i64)a_.lsx_i64 << imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i64 = a_.i64 << imm8; #else SIMDE_VECTORIZE @@ -6010,7 +6731,9 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v8u16)a_.lsx_i64 >> imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); #else SIMDE_VECTORIZE @@ -6053,7 +6776,9 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v4u32)a_.lsx_i64 >> imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); #else SIMDE_VECTORIZE @@ -6110,6 +6835,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v2u64)a_.lsx_i64 >> imm8); #else #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488) r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); @@ -6150,6 +6877,8 @@ simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(simde__m128d_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a)); #endif @@ -6168,6 +6897,8 @@ simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(__lsx_vilvl_d(a_.lsx_i64, a_.lsx_i64), mem_addr, 0); #else mem_addr[0] = a_.f64[0]; mem_addr[1] = a_.f64[0]; @@ -6194,6 +6925,10 @@ simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) const int64_t v = vgetq_lane_s64(a_.neon_i64, 0); simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, mem_addr, 0, 0); #else simde_float64 v = a_.f64[0]; simde_memcpy(mem_addr, &v, sizeof(simde_float64)); @@ -6214,6 +6949,8 @@ simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32); + #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(a_.lsx_i64, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_)); #endif @@ -6233,6 +6970,10 @@ void #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) *mem_addr = vgetq_lane_f64(a_.neon_f64, 1); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, mem_addr, 0, 1); #else *mem_addr = a_.f64[1]; #endif @@ -6247,6 +6988,8 @@ void simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else simde__m128i_private a_ = simde__m128i_to_private(a); int64_t tmp; @@ -6277,6 +7020,10 @@ void simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storel_pd(mem_addr, a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), simde__m128d_to_wasm_v128(a), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(simde__m128d_to_private(a).lsx_f64, mem_addr, 0, 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); @@ -6303,9 +7050,15 @@ simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + a_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 0); + simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0); simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vshuf4i_d(a_.lsx_i64, a_.lsx_i64, 0b0001); + __lsx_vst(temp, mem_addr, 0); #else mem_addr[0] = a_.f64[1]; mem_addr[1] = a_.f64[0]; @@ -6323,6 +7076,8 @@ simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { _mm_storeu_pd(mem_addr, a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(simde__m128d_to_private(a).lsx_f64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -6336,6 +7091,8 @@ void simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(simde__m128i_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -6352,6 +7109,8 @@ simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) _mm_storeu_si16(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_h(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else int16_t val = simde_x_mm_cvtsi128_si16(a); simde_memcpy(mem_addr, &val, sizeof(val)); @@ -6369,6 +7128,10 @@ simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) _mm_storeu_si32(mem_addr, a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store32_lane(mem_addr, simde__m128i_to_wasm_v128(a), 0); + #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_w(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else int32_t val = simde_mm_cvtsi128_si32(a); simde_memcpy(mem_addr, &val, sizeof(val)); @@ -6386,6 +7149,8 @@ simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) _mm_storeu_si64(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else int64_t val = simde_mm_cvtsi128_si64(a); simde_memcpy(mem_addr, &val, sizeof(val)); @@ -6400,8 +7165,13 @@ void simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_pd(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(mem_addr, &a, sizeof(a)); + simde_mm_store_pd(mem_addr, a); #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -6413,8 +7183,13 @@ void simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(mem_addr, &a, sizeof(a)); + simde_mm_store_si128(mem_addr, a); #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -6426,6 +7201,12 @@ void simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_si32(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vstelm_w(__lsx_vreplgr2vr_w(a), mem_addr, 0, 0); #else *mem_addr = a; #endif @@ -6439,6 +7220,12 @@ void simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION) _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_s64(mem_addr, vdup_n_s64(a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vstelm_d(__lsx_vreplgr2vr_d(a), mem_addr, 0, 0); #else *mem_addr = a; #endif @@ -6462,6 +7249,10 @@ simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = a_.i8 - b_.i8; #else @@ -6491,6 +7282,10 @@ 
simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = a_.i16 - b_.i16; #else @@ -6520,6 +7315,10 @@ simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 - b_.i32; #else @@ -6549,6 +7348,10 @@ simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 - b_.i64; #else @@ -6577,6 +7380,8 @@ simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) { r_.u32 = a_.u32 - b_.u32; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_w(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -6604,6 +7409,8 @@ simde_mm_sub_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -6632,10 +7439,12 @@ simde_mm_sub_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] - b_.f64[0]; - r_.f64[1] = a_.f64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.f64[0] = a_.f64[0] - b_.f64[0]; + r_.f64[1] = a_.f64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -6684,6 +7493,8 @@ simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) { r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_b(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) { @@ -6713,6 +7524,8 @@ simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { @@ -6744,6 +7557,8 @@ simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_sub_sat(a_.wasm_v128, 
b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u8[0])) ; i++) { @@ -6775,6 +7590,8 @@ simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u16[0])) ; i++) { @@ -6808,6 +7625,8 @@ simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -6844,6 +7663,8 @@ simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -6880,6 +7701,8 @@ simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -6916,6 +7739,8 @@ simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -6952,6 +7777,8 @@ simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -6988,6 +7815,8 @@ simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ 
-7005,15 +7834,6 @@ simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b) #endif -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_POP -#endif - SIMDE_FUNCTION_ATTRIBUTES void simde_mm_lfence (void) { @@ -7058,6 +7878,10 @@ simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) { int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16)); int8x8x2_t result = vzip_s8(a1, b1); r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_b(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); #else @@ -7093,6 +7917,10 @@ simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) { int16x4_t b1 = vget_high_s16(b_.neon_i16); int16x4x2_t result = vzip_s16(a1, b1); r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 4, 12, 5, 13, 6, 14, 7, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15); #else @@ -7128,6 +7956,10 @@ simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) { int32x2_t b1 = vget_high_s32(b_.neon_i32); int32x2x2_t result = vzip_s32(a1, b1); r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7); #else @@ -7160,6 +7992,10 @@ simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) { int64x1_t a_h = vget_high_s64(a_.neon_i64); int64x1_t b_h = vget_high_s64(b_.neon_i64); r_.neon_i64 = vcombine_s64(a_h, b_h); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3); #else @@ -7194,6 +8030,8 @@ simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(b_.lsx_i64, a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) { @@ -7227,6 +8065,10 @@ simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) { int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16)); int8x8x2_t result = vzip_s8(a1, b1); r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 
21, 6, 22, 7, 23); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_b(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); #else @@ -7262,6 +8104,10 @@ simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) { int16x4_t b1 = vget_low_s16(b_.neon_i16); int16x4x2_t result = vzip_s16(a1, b1); r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 8, 1, 9, 2, 10, 3, 11); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11); #else @@ -7297,6 +8143,10 @@ simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) { int32x2_t b1 = vget_low_s32(b_.neon_i32); int32x2x2_t result = vzip_s32(a1, b1); r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5); #else @@ -7329,6 +8179,10 @@ simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) { int64x1_t a_l = vget_low_s64(a_.neon_i64); int64x1_t b_l = vget_low_s64(b_.neon_i64); r_.neon_i64 = vcombine_s64(a_l, b_l); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2); #else @@ -7359,6 +8213,10 @@ simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vzip1q_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); #else @@ -7393,6 +8251,8 @@ simde_x_mm_negate_pd(simde__m128d a) { r_.neon_f64 = vnegq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vneg_d(a_.lsx_i64); #elif defined(SIMDE_VECTOR_NEGATE) r_.f64 = -a_.f64; #else @@ -7421,6 +8281,10 @@ simde_mm_xor_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_xor(b_.wasm_v128, a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f ^ b_.i32f; #else @@ -7453,6 +8317,8 @@ simde_x_mm_not_si128 (simde__m128i a) { r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f; #else diff --git a/x86/sse3.h b/x86/sse3.h index f46e2798a..63a9331dc 100644 --- a/x86/sse3.h +++ b/x86/sse3.h @@ -46,6 +46,10 @@ simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); r_.neon_i16 = t.val[0]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6, 8, 10, 12, 14); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14); #else @@ -72,6 +76,10 @@ simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); r_.neon_i16 = t.val[1]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15); #else @@ -98,6 +106,10 @@ simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); r_.neon_i32 = t.val[0]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6); #else @@ -124,6 +136,10 @@ simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); r_.neon_i32 = t.val[1]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7); #else @@ -150,6 +166,10 @@ simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); r_.neon_f32 = t.val[0]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6); #else @@ -176,6 +196,10 @@ simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); r_.neon_f32 = t.val[1]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7); #else @@ -199,6 +223,10 @@ simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) { #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); #else @@ -222,6 +250,10 @@ simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); #else @@ -250,6 +282,10 @@ simde_mm_addsub_pd (simde__m128d a, simde__m128d b) { float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64); float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64); return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128d temp_ra = __lsx_vfadd_d(a_.lsx_f64, b_.lsx_f64); + __m128d temp_rs = __lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64); + return (__m128d)__lsx_vextrins_d((__m128i)temp_ra, (__m128i)temp_rs, 0); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3); #else @@ -281,6 +317,11 @@ simde_mm_addsub_ps (simde__m128 a, simde__m128 b) { float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32); float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32); return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128 temp_ra = __lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32); + __m128 temp_rs = __lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32); + __m128i temp = __lsx_vextrins_w((__m128i)temp_ra, (__m128i)temp_rs, 0); + r_.lsx_i64 = __lsx_vextrins_w(temp, (__m128i)temp_rs, 0b00100010); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7); #else @@ -294,7 +335,7 @@ simde_mm_addsub_ps (simde__m128 a, simde__m128 b) { #endif } #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_addsub_ps(a, b) simde_mm_addsub_ps(a, b) +# define _mm_addsub_ps(a, b) simde_mm_addsub_ps((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -369,6 +410,8 @@ simde_mm_lddqu_si128 (simde__m128i const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -392,6 +435,8 @@ simde_mm_loaddup_pd (simde_float64 const* mem_addr) { r_.neon_f64 = vdupq_n_f64(*mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vldrepl_d(mem_addr, 0); #else r_.f64[0] = *mem_addr; r_.f64[1] = *mem_addr; @@ -418,6 +463,8 @@ simde_mm_movedup_pd (simde__m128d a) { r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = 
__lsx_vreplvei_d(a_.lsx_i64, 0); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); #else @@ -432,6 +479,11 @@ simde_mm_movedup_pd (simde__m128d a) { # define _mm_movedup_pd(a) simde_mm_movedup_pd(a) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_movehdup_ps (simde__m128 a) { @@ -446,6 +498,8 @@ simde_mm_movehdup_ps (simde__m128 a) { r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 1, 3, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpackod_w(a_.lsx_i64, a_.lsx_i64); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3); #else @@ -476,6 +530,8 @@ simde_mm_moveldup_ps (simde__m128 a) { r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpackev_w(a_.lsx_i64, a_.lsx_i64); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2); #else @@ -492,6 +548,10 @@ simde_mm_moveldup_ps (simde__m128 a) { # define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/x86/sse4.1.h b/x86/sse4.1.h index 57f1029c1..ba2bf1869 100644 --- a/x86/sse4.1.h +++ b/x86/sse4.1.h @@ -47,10 +47,17 @@ simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8) a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i]; - } +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi16((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1, + (imm8>>4)&1, (imm8>>5)&1, (imm8>>6)&1, (imm8>>7)&1); + mask = __lsx_vseqi_h(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask); +#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i]; + } +#endif return simde__m128i_from_private(r_); } @@ -96,10 +103,16 @@ simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8) a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; - } +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi32((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1); + mask = __lsx_vseqi_w(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask); +#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = ((imm8 >> i) & 1) ? 
b_.f64[i] : a_.f64[i]; + } +#endif return simde__m128d_from_private(r_); } #if defined(SIMDE_X86_SSE4_1_NATIVE) @@ -138,10 +151,16 @@ simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8) a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; - } +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi32((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1); + mask = __lsx_vseqi_w(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask); +#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; + } +#endif return simde__m128_from_private(r_); } #if defined(SIMDE_X86_SSE4_1_NATIVE) @@ -197,6 +216,8 @@ simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) { r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, m); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_b(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */ #if defined(HEDLEY_INTEL_VERSION_CHECK) @@ -241,6 +262,8 @@ simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) { r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_h(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) #if defined(HEDLEY_INTEL_VERSION_CHECK) __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -282,6 +305,8 @@ simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) { r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m)); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_w(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) #if defined(HEDLEY_INTEL_VERSION_CHECK) __typeof__(mask_.i32) z = { 0, 0, 0, 0 }; @@ -326,6 +351,8 @@ simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) { #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63))); r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_d(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) #if defined(HEDLEY_INTEL_VERSION_CHECK) __typeof__(mask_.i64) z = { 0, 0 }; @@ -352,6 +379,13 @@ simde__m128d simde_mm_blendv_pd (simde__m128d a, simde__m128d b, 
simde__m128d mask) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_blendv_pd(a, b, mask); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m_ = wasm_i64x2_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 63); + return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_m = __lsx_vfcmp_clt_d(simde__m128d_to_private(mask).lsx_f64, (__m128d)__lsx_vreplgr2vr_w(0)); + __m128i r = __lsx_vbitsel_v(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, temp_m); + return (simde__m128d)r; #else return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask))); #endif @@ -366,6 +400,13 @@ simde__m128 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_blendv_ps(a, b, mask); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m_ = wasm_i32x4_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 31); + return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_m = __lsx_vfcmp_clt_s(simde__m128_to_private(mask).lsx_f32, (__m128)__lsx_vreplgr2vr_w(0)); + __m128i r = __lsx_vbitsel_v(simde__m128_to_private(a).lsx_i64, simde__m128_to_private(b).lsx_i64, temp_m); + return (simde__m128)r; #else return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask))); #endif @@ -395,6 +436,10 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndiq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); #elif defined(simde_math_nearbyint) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -410,6 +455,10 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndaq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); #elif defined(simde_math_roundeven) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -425,6 +474,10 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndmq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrm_d(a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -438,6 +491,10 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndpq_f64(a_.neon_f64); + #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrp_d(a_.lsx_f64); #elif defined(simde_math_ceil) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -453,6 +510,10 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrz_d(a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -478,6 +539,9 @@ simde_mm_round_pd (simde__m128d a, int rounding) SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_ceil_pd (simde__m128d a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_ceil(simde__m128d_to_wasm_v128(a))); + #endif return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -488,6 +552,9 @@ simde_mm_ceil_pd (simde__m128d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_ceil_ps (simde__m128 a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128_from_wasm_v128(wasm_f32x4_ceil(simde__m128_to_wasm_v128(a))); + #endif return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -506,7 +573,9 @@ simde_mm_ceil_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_ceilf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfrintrp_d(b_.lsx_f64), 0); + #elif defined(simde_math_ceilf) r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0]))); #else HEDLEY_UNREACHABLE(); @@ -535,7 +604,9 @@ simde_mm_ceil_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(simde_math_ceilf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfrintrp_s(b_.lsx_f32), 0); + #elif defined(simde_math_ceilf) r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0]))); #else HEDLEY_UNREACHABLE(); @@ -571,6 +642,8 @@ simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_d(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { @@ -602,6 +675,10 @@ simde_mm_cvtepi8_epi16 (simde__m128i a) { int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ r_.neon_i16 = s16x8; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_extend_low_i8x16(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_h_b(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i16 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, -1, 0, -1, 1, -1, 2, -1, 3, @@ -643,6 +720,11 @@ simde_mm_cvtepi8_epi32 (simde__m128i a) { int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ r_.neon_i32 = s32x4; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); + + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_w_h(__lsx_vsllwil_h_b(a_.lsx_i64, 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, -1, -1, -1, 0, -1, -1, -1, 1, @@ -679,6 +761,12 @@ simde_mm_cvtepi8_epi64 (simde__m128i a) { int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ r_.neon_i64 = s64x2; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t extra = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); + v128_t sign = wasm_i32x4_gt(wasm_i64x2_const(0, 0), extra); + r_.wasm_v128 = wasm_i32x4_shuffle(extra, sign, 0, 4, 1, 5); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_d_w(__lsx_vsllwil_w_h(__lsx_vsllwil_h_b(a_.lsx_i64, 0), 0), 0); #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) /* Disabled on x86 due to lack of 64-bit arithmetic shift until * until AVX-512 (at which point we would be using the native @@ -718,6 +806,10 @@ simde_mm_cvtepu8_epi16 (simde__m128i a) { uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ r_.neon_u16 = u16x8; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u16x8_extend_low_u8x16(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_hu_bu(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -765,6 +857,10 @@ simde_mm_cvtepu8_epi32 (simde__m128i a) { uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ r_.neon_u32 = u32x4; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(a_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_wu_hu(__lsx_vsllwil_hu_bu(a_.lsx_i64, 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -811,6 +907,8 @@ simde_mm_cvtepu8_epi64 (simde__m128i a) { uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ r_.neon_u64 = u64x2; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_du_wu(__lsx_vsllwil_wu_hu(__lsx_vsllwil_hu_bu(a_.lsx_i64, 0), 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) 
__typeof__(r_.i8) z = { 0, }; r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -845,6 +943,10 @@ simde_mm_cvtepi16_epi32 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_w_h(a_.lsx_i64, 0); #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3)); r_.i32 >>= 16; @@ -877,6 +979,10 @@ simde_mm_cvtepu16_epi32 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_wu_hu(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.u16) z = { 0, }; r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, @@ -916,6 +1022,8 @@ simde_mm_cvtepu16_epi64 (simde__m128i a) { uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ r_.neon_u64 = u64x2; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_du_wu(__lsx_vsllwil_wu_hu(a_.lsx_i64, 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.u16) z = { 0, }; r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, @@ -951,6 +1059,8 @@ simde_mm_cvtepi16_epi64 (simde__m128i a) { int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ r_.neon_i64 = s64x2; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_d_w(__lsx_vsllwil_w_h(a_.lsx_i64, 0), 0); #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 9, 10, 0, @@ -988,6 +1098,8 @@ simde_mm_cvtepi32_epi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_d_w(a_.lsx_i64, 0); #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1)); r_.i64 >>= 32; @@ -1022,6 +1134,8 @@ simde_mm_cvtepu32_epi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_du_wu(a_.lsx_i64, 0); #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.u32) z = { 0, }; r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 
4, 1, 6)); @@ -1081,6 +1195,36 @@ simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8) } break; } + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128d tmp = __lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64); + + switch (imm8) { + case 0xff: + r_.lsx_f64 = __lsx_vfadd_d(tmp, (__m128d)__lsx_vshuf4i_d((__m128i)tmp, (__m128i)tmp, 0b0001)); + break; + case 0x13: + r_.lsx_i64 = __lsx_vilvl_d((__m128i)tmp, (__m128i)tmp); + break; + default: + { + uint64_t mask_data[] = { + (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0), + (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0), + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)tmp); + } + + r_.lsx_f64 = __lsx_vfadd_d(r_.lsx_f64, (__m128d)__lsx_vshuf4i_d((__m128i)r_.lsx_f64, (__m128i)r_.lsx_f64, 0b0001)); + + { + uint64_t mask_data[] = { + (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0), + (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0) + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), r_.lsx_i64); + } + break; + } #else simde_float64 sum = SIMDE_FLOAT64_C(0.0); @@ -1151,6 +1295,52 @@ simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8) } break; } + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + __m128 tmp = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); + + switch (imm8) { + case 0xff: + { + __m128i tmp1 = __lsx_vilvh_d((__m128i)tmp, (__m128i)tmp); + __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, (__m128)tmp); + r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1)); + break; + } + case 0x7f: + { + __m128i tmp0 = __lsx_vinsgr2vr_w(tmp, 0, 3); + __m128i tmp1 = __lsx_vilvh_d((__m128i)tmp0, (__m128i)tmp0); + __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, (__m128)tmp); + r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1)); + break; + } + default: + { + { + uint32_t mask_data[] = { + (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0) + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)tmp); + } + + __m128i tmp1 = __lsx_vilvh_d(r_.lsx_i64, r_.lsx_i64); + __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, r_.lsx_f32); + r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1)); + { + uint32_t mask_data[] = { + (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & 8) ? 
~UINT32_C(0) : UINT32_C(0) + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)r_.lsx_f32); + } + } + break; + } #else simde_float32 sum = SIMDE_FLOAT32_C(0.0); @@ -1206,7 +1396,11 @@ simde_mm_extract_epi8 (simde__m128i a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8) # define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8) +# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_neon_i8(a), imm8) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_extract_epi8(a, imm8) wasm_u8x16_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) +# define simde_mm_extract_epi8(a, imm8) __lsx_vpickve2gr_b(simde__m128i_to_lsx_i8(a), imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_extract_epi8 @@ -1236,9 +1430,13 @@ simde_mm_extract_epi32 (simde__m128i a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) # define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8) +# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_neon_i32(a), imm8) #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) -# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8)) +# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_altivec_i32(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_extract_epi32(a, imm8) wasm_i32x4_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) +# define simde_mm_extract_epi32(a, imm8) __lsx_vpickve2gr_w(simde__m128i_to_lsx_i32(a), imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_extract_epi32 @@ -1268,9 +1466,11 @@ simde_mm_extract_epi64 (simde__m128i a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64) # define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8) +# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_neon_i64(a), imm8) #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8)) +# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_altivec_i64(a), imm8)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) +# define simde_mm_extract_epi64(a, imm8) __lsx_vpickve2gr_d(simde__m128i_to_lsx_i64(a), imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) #undef _mm_extract_epi64 @@ -1292,7 +1492,11 @@ simde_mm_extract_ps (simde__m128 a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8) + #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_neon_i32(a), 
imm8) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_extract_ps(a, imm8) wasm_i32x4_extract_lane(simde__m128_to_wasm_v128((a)), (imm8) & 3) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_extract_ps(a, imm8) __lsx_vpickve2gr_w(simde__m128_to_lsx_i32(a), imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_extract_ps @@ -1302,6 +1506,9 @@ simde_mm_extract_ps (simde__m128 a, const int imm8) SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_floor_pd (simde__m128d a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_floor(simde__m128d_to_wasm_v128(a))); + #endif return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -1312,6 +1519,9 @@ simde_mm_floor_pd (simde__m128d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_floor_ps (simde__m128 a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128_from_wasm_v128(wasm_f32x4_floor(simde__m128_to_wasm_v128(a))); + #endif return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -1330,7 +1540,9 @@ simde_mm_floor_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_floor) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfrintrm_d(b_.lsx_f64), 0); + #elif defined(simde_math_floor) r_.f64[0] = simde_math_floor(b_.f64[0]); r_.f64[1] = a_.f64[1]; #else @@ -1360,7 +1572,9 @@ simde_mm_floor_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(simde_math_floorf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfrintrm_s(b_.lsx_f32), 0); + #elif defined(simde_math_floorf) r_.f32[0] = simde_math_floorf(b_.f32[0]); for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.f32[i] = a_.f32[i]; @@ -1399,6 +1613,10 @@ simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8) #endif #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) # define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_neon_i8(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i8x16_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15, HEDLEY_STATIC_CAST(int8_t, (i)))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) +# define simde_mm_insert_epi8(a, i, imm8) __lsx_vinsgr2vr_b(a, i, imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_insert_epi8 @@ -1424,6 +1642,10 @@ simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8) #endif #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) # define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_neon_i32(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i32x4_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3, (i))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) +# define simde_mm_insert_epi32(a, i, imm8) __lsx_vinsgr2vr_w(a, i, imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_insert_epi32 @@ -1461,6 +1683,10 @@ simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8) # define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) # define 
simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_neon_i64(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i64x2_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 1, (i))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) +# define simde_mm_insert_epi64(a, i, imm8) __lsx_vinsgr2vr_d(a, i, imm8) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) #undef _mm_insert_epi64 @@ -1476,14 +1702,19 @@ simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8) a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - a_.f32[0] = b_.f32[(imm8 >> 6) & 3]; - a_.f32[(imm8 >> 4) & 3] = a_.f32[0]; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (imm8 >> i) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i]; - } + float tmp1_ = b_.f32[(imm8 >> 6) & 3]; + a_.f32[(imm8 >> 4) & 3] = tmp1_; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi32((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1); + mask = __lsx_vseqi_w(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, __lsx_vreplgr2vr_w(0), mask); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = ((imm8 >> i) & 1 ) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i]; + } + #endif return simde__m128_from_private(r_); } #if defined(SIMDE_X86_SSE4_1_NATIVE) @@ -1514,6 +1745,8 @@ simde_mm_max_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_b(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -1549,6 +1782,8 @@ simde_mm_max_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_w(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -1584,6 +1819,8 @@ simde_mm_max_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -1616,6 +1853,8 @@ simde_mm_max_epu32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_wu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -1648,6 +1887,8 @@ simde_mm_min_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128); #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_b(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -1680,6 +1921,8 @@ simde_mm_min_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_w(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -1715,6 +1958,8 @@ simde_mm_min_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -1747,6 +1992,8 @@ simde_mm_min_epu32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_wu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -1842,6 +2089,8 @@ simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i64x2_make( wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)), wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmulwev_d_w(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -1878,6 +2127,8 @@ simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmul_w(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -1903,6 +2154,8 @@ simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmul_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.u32 = a_.u32 * b_.u32; #else @@ -1951,6 +2204,8 @@ simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) { r_.altivec_u16 = vec_packsu(a_.altivec_i32, b_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_hu_w(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) int32_t v SIMDE_VECTOR(32) = 
SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); @@ -1983,30 +2238,38 @@ simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding) b_ = simde__m128d_to_private(b); switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyint) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - case SIMDE_MM_FROUND_CUR_DIRECTION: + case SIMDE_MM_FROUND_TO_NEAREST_INT: + case SIMDE_MM_FROUND_CUR_DIRECTION: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrne_d(b_.lsx_f64), 0); + #elif defined(simde_math_nearbyint) r_.f64[0] = simde_math_nearbyint(b_.f64[0]); - break; - #endif + #endif + break; - #if defined(simde_math_floor) - case SIMDE_MM_FROUND_TO_NEG_INF: + case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrm_d(b_.lsx_f64), 0); + #elif defined(simde_math_floor) r_.f64[0] = simde_math_floor(b_.f64[0]); - break; - #endif + #endif + break; - #if defined(simde_math_ceil) - case SIMDE_MM_FROUND_TO_POS_INF: + case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrp_d(b_.lsx_f64), 0); + #elif defined(simde_math_ceil) r_.f64[0] = simde_math_ceil(b_.f64[0]); - break; - #endif + #endif + break; - #if defined(simde_math_trunc) - case SIMDE_MM_FROUND_TO_ZERO: + case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrz_d(b_.lsx_f64), 0); + #elif defined(simde_math_trunc) r_.f64[0] = simde_math_trunc(b_.f64[0]); - break; - #endif + #endif + break; default: HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); @@ -2035,30 +2298,38 @@ simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding) b_ = simde__m128_to_private(b); switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyintf) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - case SIMDE_MM_FROUND_CUR_DIRECTION: + case SIMDE_MM_FROUND_TO_NEAREST_INT: + case SIMDE_MM_FROUND_CUR_DIRECTION: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrne_s(b_.lsx_f32), 0); + #elif defined(simde_math_nearbyintf) r_.f32[0] = simde_math_nearbyintf(b_.f32[0]); - break; - #endif + #endif + break; - #if defined(simde_math_floorf) - case SIMDE_MM_FROUND_TO_NEG_INF: + case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrm_s(b_.lsx_f32), 0); + #elif defined(simde_math_floorf) r_.f32[0] = simde_math_floorf(b_.f32[0]); - break; - #endif + #endif + break; - #if defined(simde_math_ceilf) - case SIMDE_MM_FROUND_TO_POS_INF: + case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrp_s(b_.lsx_f32), 0); + #elif defined(simde_math_ceilf) r_.f32[0] = simde_math_ceilf(b_.f32[0]); - break; - #endif + #endif + break; - #if defined(simde_math_truncf) - case SIMDE_MM_FROUND_TO_ZERO: + case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrz_s(b_.lsx_f32), 0); + #elif defined(simde_math_truncf) r_.f32[0] = simde_math_truncf(b_.f32[0]); - break; - #endif + #endif + break; default: HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); @@ -2083,10 +2354,13 @@ simde__m128i simde_mm_stream_load_si128 (const simde__m128i* mem_addr) { #if 
defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr))); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) + return __builtin_nontemporal_load(mem_addr); #else - return *mem_addr; + return simde_mm_load_si128(mem_addr); #endif } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -2106,7 +2380,11 @@ simde_mm_test_all_ones (simde__m128i a) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r = vec_all_eq(a_.altivec_i32, vec_splats(~0)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); + r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(a_.wasm_v128, 0) & wasm_i64x2_extract_lane(a_.wasm_v128, 1)) == 0xFFFFFFFFFFFFFFFFull; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = ((__lsx_vpickve2gr_d(a_.lsx_i64, 0) & __lsx_vpickve2gr_d(a_.lsx_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); #else int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); @@ -2138,7 +2416,11 @@ simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r = vec_all_eq(tmp_.altivec_i32, vec_splats(0)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1)); + r = !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = (wasm_i64x2_extract_lane(tmp_.wasm_v128, 0) | wasm_i64x2_extract_lane(tmp_.wasm_v128, 1)) == 0; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !(__lsx_vpickve2gr_d(tmp_.lsx_i64, 0) | __lsx_vpickve2gr_d(tmp_.lsx_i64, 1)); #else int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0); @@ -2172,6 +2454,20 @@ simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) { int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64); int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64); return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m = wasm_v128_and(a_.wasm_v128, mask_.wasm_v128); + long long c0 = wasm_i64x2_extract_lane(m, 0); + long long c1 = wasm_i64x2_extract_lane(m, 1); + long long ones = c0 | c1; + long long zeros = ~(c0 & c1); + return ones && zeros; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i tmp = __lsx_vand_v(a_.lsx_i64, mask_.lsx_i64); + long long tmp0 = __lsx_vpickve2gr_d(tmp, 0); + long long tmp1 = __lsx_vpickve2gr_d(tmp, 1); + long long ones = tmp0 | tmp1; + long long zeros = ~(tmp0 & tmp1); + return ones && zeros; #else for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0)) @@ -2198,7 +2494,13 @@ simde_mm_testc_si128 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t s64 = vbicq_s64(b_.neon_i64, 
a_.neon_i64); - return !(vgetq_lane_s64(s64, 0) & vgetq_lane_s64(s64, 1)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); + return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i tmp = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); + return (__lsx_vpickve2gr_d(tmp, 0) | __lsx_vpickve2gr_d(tmp, 1)) == 0; #else int_fast32_t r = 0; @@ -2229,7 +2531,18 @@ simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t s640 = vandq_s64(b_.neon_i64, a_.neon_i64); int64x2_t s641 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0); + return !( !(vgetq_lane_s64(s641, 0) || vgetq_lane_s64(s641, 1)) \ + || !(vgetq_lane_s64(s640, 0) || vgetq_lane_s64(s640, 1)) ); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m1 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + v128_t m2 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); + return (wasm_i64x2_extract_lane(m1, 0) | wasm_i64x2_extract_lane(m1, 1)) \ + && (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i m1 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); + __m128i m2 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); + return (__lsx_vpickve2gr_d(m1, 0) | __lsx_vpickve2gr_d(m1, 1)) \ + && (__lsx_vpickve2gr_d(m2, 0) | __lsx_vpickve2gr_d(m2, 1)); #else for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0)) @@ -2258,14 +2571,25 @@ simde_mm_testz_si128 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i tmp = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); + return !(__lsx_vpickve2gr_d(tmp, 0) | __lsx_vpickve2gr_d(tmp, 1)); + #elif defined(SIMDE_HAVE_INT128_) + if ((a_.u128[0] & b_.u128[0]) == 0) { + return 1; + } + return 0; #else for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - if ((a_.u64[i] & b_.u64[i]) == 0) - return 1; + if ((a_.u64[i] & b_.u64[i]) > 0) + return 0; } #endif - return 0; + return 1; #endif } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) diff --git a/x86/sse4.2.h index 504fe2f0b..a0723952c 100644 --- a/x86/sse4.2.h +++ b/x86/sse4.2.h @@ -30,9 +30,10 @@ #include "sse4.1.h" -#if defined(__ARM_ACLE) || (defined(__GNUC__) && defined(__ARM_FEATURE_CRC32)) +#if defined(__ARM_ACLE) || (defined(__GNUC__) && defined(SIMDE_ARCH_ARM_CRC32)) #include <arm_acle.h> #endif +// ^^ Due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70974 HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -172,6 +173,10 @@ simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vshrq_n_s64(vqsubq_s64(b_.neon_i64, a_.neon_i64), 63); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpgt(a_.altivec_i64, b_.altivec_i64)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_gt(a_.wasm_v128, 
b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); #else @@ -293,17 +298,27 @@ simde_mm_crc32_u8(uint32_t prevcrc, uint8_t v) { #if defined(SIMDE_X86_SSE4_2_NATIVE) return _mm_crc32_u8(prevcrc, v); #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) return __crc32cb(prevcrc, v); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __builtin_loongarch_crcc_w_b_w(v, prevcrc); #else uint32_t crc = prevcrc; crc ^= v; - for(int bit = 0 ; bit < 8 ; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } + // Adapted from: https://create.stephan-brumme.com/crc32/ + // Apply half-byte comparison algorithm for the best ratio between + // performance and lookup-table size. + + // The lookup table just needs to store every 16th entry + // of the standard look-up table. + static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3, + 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0f]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0f]; return crc; #endif #endif @@ -318,8 +333,10 @@ simde_mm_crc32_u16(uint32_t prevcrc, uint16_t v) { #if defined(SIMDE_X86_SSE4_2_NATIVE) return _mm_crc32_u16(prevcrc, v); #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) return __crc32ch(prevcrc, v); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __builtin_loongarch_crcc_w_h_w(v, prevcrc); #else uint32_t crc = prevcrc; crc = simde_mm_crc32_u8(crc, v & 0xff); @@ -338,8 +355,10 @@ simde_mm_crc32_u32(uint32_t prevcrc, uint32_t v) { #if defined(SIMDE_X86_SSE4_2_NATIVE) return _mm_crc32_u32(prevcrc, v); #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) return __crc32cw(prevcrc, v); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __builtin_loongarch_crcc_w_w_w(v, prevcrc); #else uint32_t crc = prevcrc; crc = simde_mm_crc32_u16(crc, v & 0xffff); @@ -358,7 +377,7 @@ simde_mm_crc32_u64(uint64_t prevcrc, uint64_t v) { #if defined(SIMDE_X86_SSE4_2_NATIVE) && defined(SIMDE_ARCH_AMD64) return _mm_crc32_u64(prevcrc, v); #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32) return __crc32cd(HEDLEY_STATIC_CAST(uint32_t, prevcrc), v); #else uint64_t crc = prevcrc; diff --git a/x86/ssse3.h index 9c88f016f..db60c2fb5 100644 --- a/x86/ssse3.h +++ b/x86/ssse3.h @@ -51,6 +51,8 @@ simde_mm_abs_epi8 (simde__m128i a) { r_.altivec_i8 = vec_abs(a_.altivec_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vabsd_b(a_.lsx_i64, __lsx_vreplgr2vr_b(0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -83,6 +85,8 @@ simde_mm_abs_epi16 (simde__m128i a) { r_.altivec_i16 = vec_abs(a_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = 
wasm_i16x8_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vabsd_h(a_.lsx_i64, __lsx_vreplgr2vr_h(0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -116,6 +120,8 @@ simde_mm_abs_epi32 (simde__m128i a) { r_.altivec_i32 = vec_abs(a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vabsd_w(a_.lsx_i64, __lsx_vreplgr2vr_w(0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -251,6 +257,24 @@ simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count) ((count) > 15) \ ? (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(a), vdupq_n_s8(0), (count) & 15))) \ : (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(b), simde__m128i_to_neon_i8(a), ((count) & 15)))))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_alignr_epi8(a, b, count) \ + ({ \ + __m128i res_; \ + if (count > 31) { \ + res_ = __lsx_vreplgr2vr_b(0); \ + } \ + else if (count > 15) { \ + res_ = __lsx_vbsrl_v(a, ((count)&15)); \ + } \ + else if (count == 0) { \ + res_ = b; \ + } \ + else { \ + res_ = __lsx_vor_v(__lsx_vbsll_v(a, (16-((count)&15))), __lsx_vbsrl_v(b, ((count)&15))); \ + } \ + (simde__m128i)res_; \ + }) #endif #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) #define _mm_alignr_epi8(a, b, count) simde_mm_alignr_epi8(a, b, count) @@ -334,6 +358,13 @@ simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) { SIMDE_POWER_ALTIVEC_VECTOR(signed char) msb_mask = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(b_.altivec_i8, z)); SIMDE_POWER_ALTIVEC_VECTOR(signed char) c = vec_perm(a_.altivec_i8, a_.altivec_i8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), b_.altivec_i8)); r_.altivec_i8 = vec_sel(c, z, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), msb_mask)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_swizzle( + a_.wasm_v128, wasm_v128_and(b_.wasm_v128, wasm_i8x16_splat(0x8F))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i b1_ = __lsx_vslti_b(b_.lsx_i64, 0); + r_.lsx_i64 = __lsx_vshuf_b(a_.lsx_i64, a_.lsx_i64, __lsx_vandi_b(b_.lsx_i64, 15)); + r_.lsx_i64 = __lsx_vand_v(r_.lsx_i64, __lsx_vnor_v(b1_, b1_)); #else for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7); @@ -686,6 +717,10 @@ simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) { /* saturated add */ r_.neon_i16 = vqaddq_s16(prod1, prod2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_h_bu_b(a_.lsx_i64, b_.lsx_i64); + __m128i temp_od = __lsx_vmulwod_h_bu_b(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vsadd_h(temp_ev, temp_od); #else for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { const int idx = HEDLEY_STATIC_CAST(int, i) << 1; @@ -763,6 +798,21 @@ simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) { /* Join together */ r_.neon_i16 = vcombine_s16(narrow_lo, narrow_hi); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t __lo = wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(a_.wasm_v128), wasm_i32x4_extend_low_i16x8(b_.wasm_v128)); + v128_t __hi = wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(a_.wasm_v128), wasm_i32x4_extend_high_i16x8(b_.wasm_v128)); + const v128_t __inc = wasm_i32x4_splat(0x4000); + __lo = wasm_i32x4_add(__lo, __inc); + __hi = wasm_i32x4_add(__hi, 
__inc); + __lo = wasm_i32x4_add(__lo, __lo); + __hi = wasm_i32x4_add(__hi, __hi); + r_.wasm_v128 = wasm_i16x8_shuffle(__lo, __hi, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_w_h(a_.lsx_i64, b_.lsx_i64); + __m128i temp_od = __lsx_vmulwod_w_h(a_.lsx_i64, b_.lsx_i64); + __m128i temp1 = __lsx_vilvl_w(temp_od, temp_ev); + __m128i temp2 = __lsx_vilvh_w(temp_od, temp_ev); + r_.lsx_i64 = __lsx_vssrarni_h_w(temp2, temp1, 15); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -838,6 +888,8 @@ simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) { simde__m128i mask = wasm_i8x16_shr(b_.wasm_v128, 7); simde__m128i zeromask = simde_mm_cmpeq_epi8(b_.wasm_v128, simde_mm_setzero_si128()); r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi8(a_.wasm_v128, mask), mask)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsigncov_b(b_.lsx_i64, a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -878,6 +930,8 @@ simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) { simde__m128i mask = simde_mm_srai_epi16(b_.wasm_v128, 15); simde__m128i zeromask = simde_mm_cmpeq_epi16(b_.wasm_v128, simde_mm_setzero_si128()); r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi16(a_.wasm_v128, mask), mask)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsigncov_h(b_.lsx_i64, a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -918,6 +972,8 @@ simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) { simde__m128i mask = simde_mm_srai_epi32(b_.wasm_v128, 31); simde__m128i zeromask = simde_mm_cmpeq_epi32(b_.wasm_v128, simde_mm_setzero_si128()); r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi32(a_.wasm_v128, mask), mask)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsigncov_w(b_.lsx_i64, a_.lsx_i64); #else for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] != 0) ? 
(a_.i32[i]) : INT32_C(0)); diff --git a/x86/svml.h b/x86/svml.h index 81509e96a..40fe0cd6d 100644 --- a/x86/svml.h +++ b/x86/svml.h @@ -49,18 +49,10 @@ #include "../simde-complex.h" -#if !defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES -#endif - HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ -#if !defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_acos_ps (simde__m128 a) { @@ -2683,7 +2675,7 @@ simde_mm512_mask_cosh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi8(a, b); #else simde__m128i_private @@ -2713,7 +2705,7 @@ simde_mm_div_epi8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi16(a, b); #else simde__m128i_private @@ -2743,7 +2735,7 @@ simde_mm_div_epi16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi32(a, b); #else simde__m128i_private @@ -2776,7 +2768,7 @@ simde_mm_div_epi32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi64(a, b); #else simde__m128i_private @@ -2806,7 +2798,7 @@ simde_mm_div_epi64 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu8(a, b); #else simde__m128i_private @@ -2836,7 +2828,7 @@ simde_mm_div_epu8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu16(a, b); #else simde__m128i_private @@ -2866,7 +2858,7 @@ simde_mm_div_epu16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu32(a, b); #else simde__m128i_private @@ -2899,7 +2891,7 @@ simde_mm_div_epu32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu64(a, b); #else simde__m128i_private @@ -5101,7 
+5093,7 @@ simde_mm512_mask_cdfnorm_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_idivrem_epi32 (simde__m128i* mem_addr, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_idivrem_epi32(HEDLEY_REINTERPRET_CAST(__m128i*, mem_addr), a, b); #else simde__m128i r; @@ -8825,7 +8817,7 @@ simde_mm_clog_ps (simde__m128 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m256 simde_mm256_clog_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) return _mm256_clog_ps(a); #else simde__m256_private @@ -8880,7 +8872,7 @@ simde_mm_csqrt_ps (simde__m128 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m256 simde_mm256_csqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) return _mm256_csqrt_ps(a); #else simde__m256_private @@ -8910,7 +8902,7 @@ simde_mm256_csqrt_ps (simde__m256 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi8(a, b); #else simde__m128i_private @@ -8938,7 +8930,7 @@ simde_mm_rem_epi8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi16(a, b); #else simde__m128i_private @@ -8966,7 +8958,7 @@ simde_mm_rem_epi16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi32(a, b); #else simde__m128i_private @@ -8997,7 +8989,7 @@ simde_mm_rem_epi32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi64(a, b); #else simde__m128i_private @@ -9025,7 +9017,7 @@ simde_mm_rem_epi64 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu8(a, b); #else simde__m128i_private @@ -9053,7 +9045,7 @@ simde_mm_rem_epu8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu16(a, b); #else simde__m128i_private @@ -9081,7 +9073,7 @@ simde_mm_rem_epu16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && 
defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu32(a, b); #else simde__m128i_private @@ -9112,7 +9104,7 @@ simde_mm_rem_epu32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu64(a, b); #else simde__m128i_private @@ -12095,7 +12087,7 @@ simde_mm512_mask_trunc_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_udivrem_epi32 (simde__m128i * mem_addr, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_udivrem_epi32(mem_addr, a, b); #else simde__m128i r; diff --git a/x86/xop.h b/x86/xop.h index 8b83ed279..5249f06d7 100644 --- a/x86/xop.h +++ b/x86/xop.h @@ -3727,7 +3727,7 @@ simde_mm256_permute2_pd (simde__m256d a, simde__m256d b, simde__m256i c, const i SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ })) #else - #define simde_mm256_permute2_pd(a, b, c, imm8) simde_undeprecated_mm256_permute2_pd((a), (b), (c), (imm8)) + #define simde_mm256_permute2_pd(a, b, c, imm8) _mm256_permute2_pd((a), (b), (c), (imm8)) #endif #endif #if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) From 0ab36ec96aeb45f35edd7624ae4b130398314780 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sun, 27 Jul 2025 18:14:40 +0200 Subject: [PATCH 2/2] remove unused code --- src/util/expandaln.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/util/expandaln.cpp b/src/util/expandaln.cpp index ba616dcfe..f6622e76c 100644 --- a/src/util/expandaln.cpp +++ b/src/util/expandaln.cpp @@ -75,14 +75,6 @@ void rescoreResultByBacktrace(Matcher::result_t &result, Sequence &qSeq, Sequenc result.seqId = identities; } -static bool compareHitsByKeyScore(const Matcher::result_t &first, const Matcher::result_t &second) { - if (first.score > second.score) - return true; - if (second.score > first.score) - return false; - return false; -} - int expandaln(int argc, const char **argv, const Command& command, bool returnAlnRes) { Parameters &par = Parameters::getInstance(); // default for expand2profile to filter MSA @@ -305,7 +297,6 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl } subSeqSet.clear(); } - //std::stable_sort(resultsBc.begin(), resultsBc.end(), compareHitsByKeyScore); for (size_t k = 0; k < resultsBc.size(); ++k) { Matcher::result_t &resultBc = resultsBc[k];