From 9ebc2455224fd283b692654aefc3c9262dfeb3f2 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 14:15:56 +0000 Subject: [PATCH 01/17] Don't pass -m32 to GCC for ARM builds --- build.linux32ARMv6/squeak.cog.spur/build.assert/mvm | 2 +- build.linux32ARMv6/squeak.cog.spur/build.debug/mvm | 2 +- build.linux32ARMv6/squeak.cog.spur/build/mvm | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm b/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm index 0254afa9f4..0e28f57503 100755 --- a/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm +++ b/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm @@ -2,7 +2,7 @@ set -e # assert VM with VM profiler and threaded heartbeat INSTALLDIR=assert/sqcogspurlinuxhtRPi -OPT="-m32 -g3 -O1 -DDEBUGVM=0" +OPT="-g3 -O1 -DDEBUGVM=0" if [ $# -ge 1 ]; then INSTALLDIR="$1"; shift diff --git a/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm b/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm index 764e80f084..c35c5e9f6f 100755 --- a/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm +++ b/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm @@ -2,7 +2,7 @@ set -e # debug Spur VM with VM profiler and threaded heartbeat INSTALLDIR=debug/sqcogspurlinuxhtRPi -OPT="-m32 -g3 -O0 -DDEBUGVM=1" +OPT="-g3 -O0 -DDEBUGVM=1" if [ $# -ge 1 ]; then INSTALLDIR="$1"; shift diff --git a/build.linux32ARMv6/squeak.cog.spur/build/mvm b/build.linux32ARMv6/squeak.cog.spur/build/mvm index e1fd0f57bc..f16d72208d 100755 --- a/build.linux32ARMv6/squeak.cog.spur/build/mvm +++ b/build.linux32ARMv6/squeak.cog.spur/build/mvm @@ -2,7 +2,7 @@ set -e # Spur VM with VM profiler and threaded heartbeat INSTALLDIR=sqcogspurlinuxhtRPi -OPT="-m32 -g -O2 -DNDEBUG -DDEBUGVM=0" +OPT="-g -O2 -DNDEBUG -DDEBUGVM=0" if [ $# -ge 1 ]; then INSTALLDIR="$1"; shift From 38c42831095964b612a39fdb700de5d6802f6777 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 28 Apr 2021 17:09:20 +0100 Subject: [PATCH 02/17] Correct various "#if ENABLE_FAST_BLT" to "#ifdef" ENABLE_FAST_BLT is typically not assigned a value even when it is defined, so "#if" form is tecnically a syntax error. 
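As a minimal stand-alone illustration (hypothetical code, not taken from the plugin;
it assumes the macro is defined with no replacement text, e.g. "#define ENABLE_FAST_BLT"
in a header):

    #define ENABLE_FAST_BLT               /* defined, but expands to nothing */

    #ifdef ENABLE_FAST_BLT                /* correct: tests only definedness */
    int fastBltEnabled = 1;
    #endif

    /* The old form fails to preprocess, because after macro expansion the
     * directive is left as "#if" with an empty controlling expression:
     *
     *     #if ENABLE_FAST_BLT
     *     ...
     *     #endif
     */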
--- src/plugins/BitBltPlugin/BitBltPlugin.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index f9fa00d119..7c7d952f01 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -1368,7 +1368,7 @@ copyBits(void) if (!(lockSurfaces())) { return primitiveFail(); } -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT /* you really, really mustn't call this unless you have the rest of the code to link to */ copyBitsFastPathSpecialised(); @@ -1390,7 +1390,7 @@ static sqInt copyBitsFastPathSpecialised(void) { -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT /* set the affected area to 0 first */ affectedL = (affectedR = (affectedT = (affectedB = 0))); @@ -1814,7 +1814,7 @@ copyBitsFallback(operation_t *op, unsigned int flags) sqInt t; -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT /* recover values from the operation struct used by the fast ARM code */ @@ -3159,7 +3159,7 @@ initialiseModule(void) { initBBOpTable(); initDither8Lookup(); -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT initialiseCopyBits(); # endif return 1; @@ -5192,7 +5192,7 @@ primitiveCompareColors(void) if (failed()) { return null; } -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT if (!(loadBitBltFromwarping(rcvr, 0))) { return primitiveFail(); } From 43ce97589b7b38fc80bbe6e8151810e7954b676e Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 15:11:05 +0000 Subject: [PATCH 03/17] Don't assume sourcePPW is valid on entry to copyBitsFallback This is not the case when being called from "fuzz" or "bench" test applications. It may also not be accurate if a fast path has been synthesised from a combination of copyBitsFallback and one or more other fast paths. --- src/plugins/BitBltPlugin/BitBltPlugin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index 7c7d952f01..482e8b5de1 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -1852,6 +1852,7 @@ copyBitsFallback(operation_t *op, unsigned int flags) ungammaLookupTable = (void *) op->opt.componentAlpha.ungammaLookupTable; } destPPW = 32 / destDepth; + sourcePPW = 32 / sourceDepth; cmBitsPerColor = 0; if (cmMask == 0x1FF) { cmBitsPerColor = 3; From df667cab1cbab9829ef426a4a6865e48f6b7d57f Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 17:14:55 +0000 Subject: [PATCH 04/17] Fallback routines need extra help to detect intra-image operations In some places, sourceForm and destForm were being compared to determine which code path to follow. However, when being called from fuzz or other test tools, these structures aren't used to pass parameters, so the pointers haven't been initialised and default to 0, so the wrong code path is followed. Detect such cases and initialise them from sourceBits and destBits instead, since these will perform the same under equality tests. 
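A stand-alone sketch of that equality-test property (hypothetical, simplified types;
not the plugin's actual globals or control flow):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t srcPixels[16], dstPixels[16];
        uintptr_t sourceForm = 0, destForm = 0;     /* left unset by a test harness  */
        assert(sourceForm == destForm);             /* false positive: "same image"  */
        if (sourceForm == 0 && destForm == 0) {     /* substitution made by this fix */
            sourceForm = (uintptr_t) srcPixels;
            destForm   = (uintptr_t) dstPixels;
        }
        assert(sourceForm != destForm);             /* distinct images now detected  */
        return 0;
    }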
--- src/plugins/BitBltPlugin/BitBltPlugin.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index 482e8b5de1..05a757d0c5 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -1863,6 +1863,14 @@ copyBitsFallback(operation_t *op, unsigned int flags) if (cmMask == 0x7FFF) { cmBitsPerColor = 5; } + /* In some places, sourceForm and destForm are compared in order to detect + * whether we're reading and writing the same image. However, these have + * not always been initialised by the time we get here, so substitute + * sourceBits and destBits if so. */ + if (sourceForm == 0 && destForm == 0) { + sourceForm = sourceBits; + destForm = destBits; + } /* begin tryCopyingBitsQuickly */ if (noSource) { done = 0; From 51df83e0b30106b2c0958945f7eb0486b6510709 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 25 Mar 2021 15:42:39 +0000 Subject: [PATCH 05/17] Remove invalid shortcut in rgbComponentAlphawith This shortcut is triggered more frequently than it used to be, due to improvements in copyLoop() that avoid buffer overruns. --- src/plugins/BitBltPlugin/BitBltPlugin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index 05a757d0c5..428eeecbf4 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -6392,9 +6392,13 @@ rgbComponentAlphawith(sqInt sourceWord, sqInt destinationWord) sqInt v; alpha = sourceWord; +#if 0 + /* This is not a valid optimisation because alpha == 0 can change the destination + * due to rounding errors in blending and/or in gamma lookup round-trip */ if (alpha == 0) { return destinationWord; } +#endif /* begin partitionedRgbComponentAlpha:dest:nBits:nPartitions: */ /* partition mask starts at the right */ From a64c5b69b9f93a03ac2146799e67fe726943d684 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 8 Apr 2021 10:33:39 +0100 Subject: [PATCH 06/17] Fix bug in 32-bit ARM fast paths When classed as "wide" because each line is long enough to warrant pipelined prefetching as we go along, the inner loop is unrolled enough that there is at least one prefetch instruction per iteration. Loading the source image can only be done in atoms of 32 bit words due to big-endian packing, so when destination pixels are 8 times wider (or more) than source pixels, the loads happen less frequently than the store atoms (quadwords) and a conditional branch per subblock is required to decide whether to do a load or not, depending on the skew and the number of pixels remaining to process. The 'x' register is only updated once per loop, so an assembly-time constant derived from the unrolling subblock number needs to be factored in, but since the number of pixels remaining decreases as the subblock number increases, this should have been a subtraction. In practice, since only the least-significant bits of the result matter, addition and subtraction behave the same when the source:destination pixel ratio is 8, so the only operations affected were 1->16bpp, 2->32bpp and 1->32bpp. The exact threshold that counts as "wide" depends on the prefetch distance that was selected empirically, but typically would require an operation that is several hundreds of pixels wide. 
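The claim about ratios can be checked with a short stand-alone program (illustrative
only, not plugin code): for each depth combination taken by this branch
(dst_w_bpp > 4 * src_bpp), it tests whether adding or subtracting the per-subblock
term can change the bits examined by the TST, whose mask is
32/src_bpp - 128/dst_w_bpp.

    #include <stdio.h>

    int main(void)
    {
        static const int src[] = {1, 2, 4, 1, 2, 1};
        static const int dst[] = {8, 16, 32, 16, 32, 32};
        for (int i = 0; i < 6; i++) {
            int mask = 32 / src[i] - 128 / dst[i];           /* bits tested by TST */
            int differ = 0;
            for (int x = 256; x < 512 && !differ; x++)       /* pixels remaining   */
                for (int s = 0; s < 8 && !differ; s++) {     /* subblock number    */
                    int step = s * 128 / dst[i];             /* per-subblock term  */
                    differ = ((x + step) & mask) != ((x - step) & mask);
                }
            printf("%2d->%2dbpp: ADD %s SUB\n", src[i], dst[i], differ ? "!=" : "==");
        }
        return 0;
    }

The ratio-8 cases (1->8, 2->16, 4->32) report ADD == SUB, while 1->16, 2->32 and
1->32 report ADD != SUB, matching the list of affected operations above.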
--- platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr index efe013cea7..a48a13c4b8 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr @@ -1542,7 +1542,7 @@ subblock SETA 0 WHILE subblock < pix_per_block*dst_w_bpp/128 [ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0 ASSERT flags & FLAG_MAX_256BIT_MACRO = 0 - AddL scratch, x, (prefetch_distance+2)*pix_per_block + subblock*128/dst_w_bpp + AddL scratch, x, (prefetch_distance+2)*pix_per_block - subblock*128/dst_w_bpp TST scratch, #32/src_bpp - 128/dst_w_bpp BNE %FT53 Read1Word src, 0, carry, &$fixed_skew, skew, scratch From b577ab2e4dc476c3ee740fa54af755d864995f42 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 13 Apr 2021 12:24:26 +0100 Subject: [PATCH 07/17] Fix buffer overflow bugs In fastPathDepthConv (which combines sourceWord colour-depth conversion with another fast path for another combinationRule at a constant colour depth) and fastPathRightToLeft, it could overflow the temporary buffer and thereby corrupt other local variables if the last chunk of a pixel row was 2048 bytes (or just under). This was most likely to happen with 32bpp destination images and widths of about 512 pixels. --- .../Cross/plugins/BitBltPlugin/BitBltGeneric.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 762ce7780f..cb474b1f95 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -260,8 +260,13 @@ static void fastPathRightToLeft(operation_t *op, uint32_t flags) uint32_t width = op->width; uint32_t height = op->height; - uint8_t tempBuffer[CACHELINE_LEN * 64]; -#define BUFFER_LEN_PIXELS (cacheline_len * (sizeof tempBuffer / CACHELINE_LEN)) +#define BUFFER_LEN_CACHELINES 64 + /* Need enough room to be able to quickly find BUFFER_LEN_CACHELINES + * starting at any cacheline alignment + */ + uint8_t spaceForTempBuffer[CACHELINE_LEN * (BUFFER_LEN_CACHELINES + 2)]; + uint8_t *tempBuffer = (uint8_t *)(((uintptr_t) spaceForTempBuffer + CACHELINE_LEN - 1) &~ (CACHELINE_LEN - 1)); +#define BUFFER_LEN_PIXELS (cacheline_len * BUFFER_LEN_CACHELINES) opFromSrc.dest.bits = tempBuffer; opFromSrc.dest.y = 0; opFromSrc.height = 1; @@ -377,8 +382,13 @@ static void fastPathDepthConv(operation_t *op, uint32_t flags) uint32_t width = op->width; uint32_t height = op->height; - uint8_t tempBuffer[CACHELINE_LEN * 64]; -#define BUFFER_LEN_PIXELS (cacheline_len * (sizeof tempBuffer / CACHELINE_LEN)) +#define BUFFER_LEN_CACHELINES 64 + /* Need enough room to be able to quickly find BUFFER_LEN_CACHELINES + * starting at any cacheline alignment + */ + uint8_t spaceForTempBuffer[CACHELINE_LEN * (BUFFER_LEN_CACHELINES + 2)]; + uint8_t *tempBuffer = (uint8_t *)(((uintptr_t) spaceForTempBuffer + CACHELINE_LEN - 1) &~ (CACHELINE_LEN - 1)); +#define BUFFER_LEN_PIXELS (cacheline_len * BUFFER_LEN_CACHELINES) opFromSrc.combinationRule = CR_sourceWord; opFromSrc.dest.bits = tempBuffer; opFromSrc.dest.y = 0; From 9084c1757bccb2216dc5e704743132f2df9f854a Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 13 Apr 2021 17:20:54 +0100 Subject: [PATCH 08/17] Fix corruption bugs with wide 1bpp source images For 
images that were wide enough to invoke intra-line preloads, there was a register clash between the preload address calculation and one of the registers holding the deskewed source pixels (this only occurred once per destination cacheline). --- .../BitBltPlugin/BitBltArmSimdSourceWord.s | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s index 27233f5d5f..ec5b1737b9 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s @@ -528,7 +528,7 @@ counter SETA counter + 4 SourceWord GenerateFunctions 1, 4,, \ FLAG_COLOUR_MAP :OR: FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS :OR: FLAG_NO_EXPAND_SKEW, 2, \ - "stride_s,map,bitptrs,orig_w,scratch", \ + "stride_s,map,bitptrs,scratch,orig_w", \ "x,stride_s,bitptrs", orig_w,, init ; leading_pixels_reg=wk3 ; ******************************************************************** @@ -951,7 +951,7 @@ counter SETA counter + 4 MACRO SourceWord1_2_128bits_head $src, $fixed_skew, $intra_preloads - Read2Words src, 3, carry, $fixed_skew, skew, $wk0 + Read2Words src, 2, carry, $fixed_skew, skew, $wk0 MEND MACRO @@ -959,8 +959,8 @@ counter SETA counter + 4 LCLA counter counter SETA 0 WHILE counter < 16 - MSR CPSR_f, $wk3 - MOV $wk3, $wk3, LSL #4 + MSR CPSR_f, $wk2 + MOV $wk2, $wk2, LSL #4 ORRPL $wk0, ht, $wk0, LSL #2 ORRMI $wk0, ht_info, $wk0, LSL #2 ORRNE $wk0, ht, $wk0, LSL #2 @@ -973,8 +973,8 @@ counter SETA counter + 4 WEND counter SETA 0 WHILE counter < 16 - MSR CPSR_f, $wk3 - MOV $wk3, $wk3, LSL #4 + MSR CPSR_f, $wk2 + MOV $wk2, $wk2, LSL #4 ORRPL $wk1, ht, $wk1, LSL #2 ORRMI $wk1, ht_info, $wk1, LSL #2 ORRNE $wk1, ht, $wk1, LSL #2 @@ -987,8 +987,13 @@ counter SETA counter + 4 WEND counter SETA 0 WHILE counter < 16 + [ counter = 0 + MSR CPSR_f, $wk3 + MOV $wk4, $wk3, LSL #4 + | MSR CPSR_f, $wk4 MOV $wk4, $wk4, LSL #4 + ] ORRPL $wk2, ht, $wk2, LSL #2 ORRMI $wk2, ht_info, $wk2, LSL #2 ORRNE $wk2, ht, $wk2, LSL #2 From 2b0279ab33949f75fb192a37864d188d219c461c Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Mon, 29 Mar 2021 15:51:51 +0100 Subject: [PATCH 09/17] Fix type of halftone array for 64-bit targets The halftone array is accessed using a hard-coded multiplier of 4 bytes, therefore the type of each element needs to be 32 bit on every platform. `sqInt` is not appropriate for this use, since it is a 64-bit type on 64-bit platforms. Rather than unilaterally introduce C99 stdint types, use `unsigned int` since this wil be 32-bit on both current fast path binary targets. 
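A stand-alone sketch of the assumption being protected (hypothetical code, not the
plugin's): the fast paths address halftone entries at byte offset 4*i, which only
lines up with the C declaration if each element is 4 bytes wide.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int halftone[4];              /* element type after this patch */
        assert((uintptr_t) &halftone[1] - (uintptr_t) &halftone[0] == 4);
        /* Declared with a 64-bit element type (as sqInt is on 64-bit
         * platforms), the same difference would be 8, so every entry after
         * the first would be read from the wrong offset. */
        return 0;
    }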
--- platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h index 78fbfdc059..136b2407e5 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h @@ -116,7 +116,7 @@ typedef struct { unsigned int (*cmLookupTable)[]; bool noHalftone; usqInt halftoneHeight; - sqInt (*halftoneBase)[]; + unsigned int (*halftoneBase)[]; union { sqInt sourceAlpha; struct { From e22ae0b9cf2a4b42c99dc2471c4937eb40c39cad Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 27 Apr 2021 12:25:01 +0100 Subject: [PATCH 10/17] Detect and add a new fast path flag for effective-1bpp colour maps Sometimes, colour maps are used such that all entries except the first contain the same value. Combined with the fact that only source colour 0 uses colour map entry 0 (any other colours for which all non-0 bits would otherwise be discarded during index generation are forced to use entry 1 instead), this effectively acts as a 2-entry (or 1bpp) map, depending on whether the source colour is 0 or not. This is far more efficiently coded in any fast path by a test against zero, than by a table lookup - it frees up 2 KB, 16 KB or 128 KB of data cache space, depending on whether a 9-, 12- or 15-bit colour map was used. There is an up-front cost to scanning the colour map to see if its entries are of this nature, however in most "normal" colour maps, this scan will rapidly be aborted. --- .../plugins/BitBltPlugin/BitBltDispatch.c | 71 +++++++++++-------- .../plugins/BitBltPlugin/BitBltInternal.h | 32 +++++---- 2 files changed, 60 insertions(+), 43 deletions(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c index 6065dbab83..10a762be43 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c @@ -310,41 +310,56 @@ void copyBitsDispatch(operation_t *op) if (op->cmFlags & ColorMapIndexedPart) { if (op->cmFlags & ColorMapFixedPart) { - if (op->src.depth == 32) { - if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0) - flags |= FAST_PATH_15BIT_COLOR_MAP; - else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0) - flags |= FAST_PATH_12BIT_COLOR_MAP; - else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0) - flags |= FAST_PATH_9BIT_COLOR_MAP; - else { - /* Unsupported case 1 */ - copyBitsFallback(op, 0); + /* First check whether all except the first colour map entry match. 
+ * This indicates that the operation depends only upon whether + * each pixel has value 0 or not (note that non-zero pixels that + * reduce to 0 when bit-packed use lookup index 1) + */ + usqInt i; + flags |= FAST_PATH_1BIT_COLOR_MAP; + for (i = op->cmMask; i >= 2; --i) { + if ((*op->cmLookupTable)[i] != (*op->cmLookupTable)[1]) { + flags &= ~FAST_PATH_1BIT_COLOR_MAP; + break; + } + } + if ((flags & FAST_PATH_1BIT_COLOR_MAP) == 0) { + if (op->src.depth == 32) { + if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0) + flags |= FAST_PATH_15BIT_COLOR_MAP; + else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0) + flags |= FAST_PATH_12BIT_COLOR_MAP; + else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0) + flags |= FAST_PATH_9BIT_COLOR_MAP; + else { + /* Unsupported case 1 */ + copyBitsFallback(op, 0); #ifdef PROFILING - profile_unrecorded_cases[1]++; + profile_unrecorded_cases[1]++; #endif - return; - } - } else if (op->src.depth == 16) { - if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0) - flags |= FAST_PATH_12BIT_COLOR_MAP; - else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0) - flags |= FAST_PATH_9BIT_COLOR_MAP; - else { - /* Unsupported case 2 */ + return; + } + } else if (op->src.depth == 16) { + if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0) + flags |= FAST_PATH_12BIT_COLOR_MAP; + else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0) + flags |= FAST_PATH_9BIT_COLOR_MAP; + else { + /* Unsupported case 2 */ + copyBitsFallback(op, 0); +#ifdef PROFILING + profile_unrecorded_cases[2]++; +#endif + return; + } + } else { + /* Unsupported case 3 */ copyBitsFallback(op, 0); #ifdef PROFILING - profile_unrecorded_cases[2]++; + profile_unrecorded_cases[3]++; #endif return; } - } else { - /* Unsupported case 3 */ - copyBitsFallback(op, 0); -#ifdef PROFILING - profile_unrecorded_cases[3]++; -#endif - return; } } else { if ((op->src.depth < 16 && op->cmMask == (1u << op->src.depth) - 1) || diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h b/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h index a653fb64c1..f82cb1d24e 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h @@ -62,21 +62,22 @@ enum { FAST_PATH_DEST_LITTLE_ENDIAN = 1u<<16, FAST_PATH_NO_COLOR_MAP = 1u<<17, - FAST_PATH_9BIT_COLOR_MAP = 1u<<18, - FAST_PATH_12BIT_COLOR_MAP = 1u<<19, - FAST_PATH_15BIT_COLOR_MAP = 1u<<20, + FAST_PATH_1BIT_COLOR_MAP = 1u<<18, + FAST_PATH_9BIT_COLOR_MAP = 1u<<19, + FAST_PATH_12BIT_COLOR_MAP = 1u<<20, + FAST_PATH_15BIT_COLOR_MAP = 1u<<21, FAST_PATH_DIRECT_COLOR_MAP = FAST_PATH_15BIT_COLOR_MAP, /* for use with <16bpp */ - FAST_PATH_NO_HALFTONE = 1u<<21, - FAST_PATH_SCALAR_HALFTONE = 1u<<22, - FAST_PATH_VECTOR_HALFTONE = 1u<<23, + FAST_PATH_NO_HALFTONE = 1u<<22, + 
FAST_PATH_SCALAR_HALFTONE = 1u<<23, + FAST_PATH_VECTOR_HALFTONE = 1u<<24, - FAST_PATH_CA_NO_GAMMA = 1u<<24, - FAST_PATH_CA_HAS_GAMMA = 1u<<25, + FAST_PATH_CA_NO_GAMMA = 1u<<25, + FAST_PATH_CA_HAS_GAMMA = 1u<<26, - FAST_PATH_NO_OVERLAP = 1u<<26, - FAST_PATH_H_OVERLAP = 1u<<27, - FAST_PATH_V_OVERLAP = 1u<<28, + FAST_PATH_NO_OVERLAP = 1u<<27, + FAST_PATH_H_OVERLAP = 1u<<28, + FAST_PATH_V_OVERLAP = 1u<<29, }; /* These are the derived macros for use in specifying fast paths. */ @@ -102,10 +103,11 @@ enum { #define ONLY_DEST_BIG_ENDIAN (FAST_PATH_DEST_LITTLE_ENDIAN) #define ONLY_DEST_LITTLE_ENDIAN (FAST_PATH_DEST_LITTLE_ENDIAN) -#define ONLY_NO_COLOR_MAP (FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) -#define ONLY_9BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) -#define ONLY_12BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) -#define ONLY_15BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP) +#define ONLY_NO_COLOR_MAP (FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_1BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_9BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_12BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_15BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP) #define ONLY_DIRECT_COLOR_MAP ONLY_15BIT_COLOR_MAP /* for use with <16bpp */ #define ONLY_NO_HALFTONE (FAST_PATH_SCALAR_HALFTONE | FAST_PATH_VECTOR_HALFTONE) From e44e2c89783fe8b46d9ee7eec1242bfb97f91dfb Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 31 Mar 2021 12:01:47 +0100 Subject: [PATCH 11/17] C fast path for 32bpp alphaBlend This runs approx 2.6x faster when benchmarked on Cortex-A72 in AArch64. 
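For reference, the per-channel arithmetic used below, written out for a single 8-bit
channel (a sketch, not the committed code; the fast path evaluates the A,G and R,B
channel pairs together in one 32-bit word, which works because each 16-bit lane stays
within 16 bits throughout and so cannot carry into its neighbour):

    /* s, d, a in 0..255; result approximates (s*a + d*(255-a)) / 255,
     * using only shifts and adds in place of the division. */
    static inline unsigned int blendChannel(unsigned int s, unsigned int d,
                                            unsigned int a)
    {
        unsigned int t = s * a + d * (255 - a) + 0xFF;   /* at most 0xFF00 */
        return ((t >> 8) + t) >> 8;
    }

blendChannel(sR, dR, alpha) should correspond to the R byte produced by one
iteration of the inner loop below.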
--- .../plugins/BitBltPlugin/BitBltGeneric.c | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index cb474b1f95..526ab7fc8d 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -205,6 +205,37 @@ static void fastPathSourceWord32_32(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathAlphaBlend32_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + uint32_t *src = srcBits + srcPitch * srcY + srcX; + uint32_t *dest = destBits + destPitch * destY + destX; + srcPitch -= width; + destPitch -= width; + do { + uint32_t remain = width; + do { + uint32_t s = *src++; + uint32_t d = *dest; + unsigned int alpha = (s >> 24) & 0xFF; + unsigned int unAlpha = 0xFF - alpha; + uint32_t sAG = ((s >> 8) & 0xFF) | 0xFF0000; + uint32_t sRB = s & 0xFF00FF; + uint32_t dAG = (d >> 8) & 0xFF00FF; + uint32_t dRB = d & 0xFF00FF; + uint32_t blendAG = sAG * alpha + dAG * unAlpha + 0xFF00FF; + uint32_t blendRB = sRB * alpha + dRB * unAlpha + 0xFF00FF; + blendAG = ((((blendAG >> 8) & 0xFF00FF) + blendAG) >> 8) & 0xFF00FF; + blendRB = ((((blendRB >> 8) & 0xFF00FF) + blendRB) >> 8) & 0xFF00FF; + d = (blendAG << 8) | blendRB; + *dest++ = d; + } while (--remain > 0); + src += srcPitch; + dest += destPitch; + } while (--height > 0); +} + static void fastPathRightToLeft(operation_t *op, uint32_t flags) { @@ -453,6 +484,7 @@ static fast_path_t fastPaths[] = { { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, + { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, { fastPathNoOp, CR_destinationWord, 0 }, /* Some special fast paths to extend the abilities of the others in corner cases */ From 45649aa5af90d36df13dd574e7ef31148b732035 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Fri, 23 Apr 2021 13:39:05 +0100 Subject: [PATCH 12/17] C fast path for planar alphaBlend --- .../plugins/BitBltPlugin/BitBltGeneric.c | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 526ab7fc8d..616ff78a91 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -205,6 +205,36 @@ static void fastPathSourceWord32_32(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathAlphaBlend0_32_scalar(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + uint32_t *dest = destBits + destPitch * destY + destX; + destPitch -= width; + uint32_t s = (*op->halftoneBase)[0]; + unsigned int alpha = (s >> 24) & 0xFF; + unsigned int unAlpha = 0xFF - alpha; + uint32_t sAG = ((s >> 8) & 0xFF) | 0xFF0000; + uint32_t sRB = s & 0xFF00FF; + sAG *= alpha; + sRB *= alpha; + do { + uint32_t remain = width; + do { + uint32_t d = *dest; + uint32_t dAG = (d >> 8) & 0xFF00FF; + uint32_t dRB = d & 0xFF00FF; + uint32_t blendAG = sAG + dAG * unAlpha + 0xFF00FF; + uint32_t blendRB = sRB + dRB * unAlpha + 0xFF00FF; + blendAG = ((((blendAG >> 8) & 0xFF00FF) + blendAG) >> 8) & 0xFF00FF; + blendRB = ((((blendRB >> 8) & 0xFF00FF) + blendRB) >> 8) & 0xFF00FF; + d = (blendAG << 8) | blendRB; + *dest++ = d; + } while 
(--remain > 0); + dest += destPitch; + } while (--height > 0); +} + static void fastPathAlphaBlend32_32(operation_t *op, uint32_t flags) { IGNORE(flags); @@ -484,6 +514,7 @@ static fast_path_t fastPaths[] = { { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, + { fastPathAlphaBlend0_32_scalar, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, { fastPathNoOp, CR_destinationWord, 0 }, From 405f35babf738496b28b0849fc7086b078358e46 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 14 Apr 2021 15:20:14 +0100 Subject: [PATCH 13/17] C fast path for 8->32bpp conversion --- .../plugins/BitBltPlugin/BitBltGeneric.c | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 616ff78a91..04d255440f 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -192,6 +192,109 @@ static void fastPathSourceWord0_32_scalar(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathSourceWord8_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint8_t, uint32_t); + uint8_t *src = srcBits + srcPitch * srcY + (srcX ^ 3); + uint32_t *dest = destBits + destPitch * destY + destX; + switch ((srcX + width) & 3) { + case 0: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 2) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 1) + { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); + break; + case 1: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 2) + { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + if ((x & 3) >= 1) + *dest++ = (*cmLookupTable)[*s--]; + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s]; + s += 7; + *dest++ = (*cmLookupTable)[*s--]; + } + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); + break; + case 2: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + if ((x & 3) >= 2) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 1) + *dest++ = (*cmLookupTable)[*s--]; + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s]; + s += 7; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + } + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); + break; + case 3: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 2) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 1) + *dest++ = (*cmLookupTable)[*s--]; + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + } + src += srcPitch; + dest += destPitch - width; + } 
while (--height > 0); + break; + } +} + static void fastPathSourceWord32_32(operation_t *op, uint32_t flags) { IGNORE(flags); @@ -513,6 +616,7 @@ static fast_path_t fastPaths[] = { { fastPathClearWord32, CR_clearWord, STD_FLAGS_NO_SOURCE(32,NO) }, { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + { fastPathSourceWord8_32, CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, { fastPathAlphaBlend0_32_scalar, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, From dac723ffd268789b93124573669e541dee9616e1 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 27 Apr 2021 12:35:49 +0100 Subject: [PATCH 14/17] C fast path for alphaBlend with 1bpp colour map and scalar halftone --- .../plugins/BitBltPlugin/BitBltGeneric.c | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 04d255440f..e32b543d7b 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -338,6 +338,42 @@ static void fastPathAlphaBlend0_32_scalar(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathAlphaBlend32_32_map1_scalar(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + uint32_t *src = srcBits + srcPitch * srcY + srcX; + uint32_t *dest = destBits + destPitch * destY + destX; + srcPitch -= width; + destPitch -= width; + uint32_t s[2] = { (*op->cmLookupTable)[0] & (*op->halftoneBase)[0], (*op->cmLookupTable)[1] & (*op->halftoneBase)[0] }; + unsigned int alpha[2] = { (s[0] >> 24) & 0xFF, (s[1] >> 24) & 0xFF }; + unsigned int unAlpha[2] = { 0xFF - alpha[0], 0xFF - alpha[1] }; + uint32_t sAG[2] = { ((s[0] >> 8) & 0xFF) | 0xFF0000, ((s[1] >> 8) & 0xFF) | 0xFF0000 }; + uint32_t sRB[2] = { s[0] & 0xFF00FF, s[1] & 0xFF00FF }; + sAG[0] *= alpha[0]; + sAG[1] *= alpha[1]; + sRB[0] *= alpha[0]; + sRB[1] *= alpha[1]; + do { + uint32_t remain = width; + do { + uint32_t s = !!*src++; + uint32_t d = *dest; + uint32_t dAG = (d >> 8) & 0xFF00FF; + uint32_t dRB = d & 0xFF00FF; + uint32_t blendAG = sAG[s] + dAG * unAlpha[s] + 0xFF00FF; + uint32_t blendRB = sRB[s] + dRB * unAlpha[s] + 0xFF00FF; + blendAG = ((((blendAG >> 8) & 0xFF00FF) + blendAG) >> 8) & 0xFF00FF; + blendRB = ((((blendRB >> 8) & 0xFF00FF) + blendRB) >> 8) & 0xFF00FF; + d = (blendAG << 8) | blendRB; + *dest++ = d; + } while (--remain > 0); + src += srcPitch; + dest += destPitch; + } while (--height > 0); +} + static void fastPathAlphaBlend32_32(operation_t *op, uint32_t flags) { IGNORE(flags); @@ -619,6 +655,7 @@ static fast_path_t fastPaths[] = { { fastPathSourceWord8_32, CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, { fastPathAlphaBlend0_32_scalar, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + { fastPathAlphaBlend32_32_map1_scalar, CR_alphaBlend, STD_FLAGS(32,32,1BIT,SCALAR) }, { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, { fastPathNoOp, CR_destinationWord, 0 }, From 80cd2da48177ef814c9ce3905ee66866a9b2fbc0 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 28 Apr 2021 16:32:56 +0100 Subject: [PATCH 15/17] Apply scalar halftoning to colour map entries instead for 32bpp destination This makes better use of 
existing fast paths, and applies to all platforms. --- .../plugins/BitBltPlugin/BitBltGeneric.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index e32b543d7b..677c886688 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -640,6 +640,30 @@ static void fastPathDepthConv(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathEarlyHalftone(operation_t *op, uint32_t flags) +{ + /* With a 32bpp destination, the action of halftoning has no dependency on + * the destination X coordinate of a pixel. Furthermore, scalar halftoning + * doesn't depend on the destination Y coordinate either. If there's a + * colour map as well, this means we can apply the halftone to the colour + * map entries instead, which makes it easier to find a match for the + * remaining part of the operation from amongst the available fast paths. + */ + uint32_t flags2 = (flags &~ FAST_PATH_SCALAR_HALFTONE) | FAST_PATH_NO_HALFTONE; + void (*func)(operation_t *, uint32_t) = lookupFastPath(op->combinationRule, flags2); + if (func == NULL) { + copyBitsFallback(op, flags); + } else { + uint32_t cmLookupTable2[1<<15]; + for (int32_t i = op->cmMask; i >= 0; --i) + cmLookupTable2[i] = (*op->cmLookupTable)[i] & (*op->halftoneBase)[0]; + operation_t op2 = *op; + op2.cmLookupTable = &cmLookupTable2; + op2.noHalftone = true; + func(&op2, flags2); + } +} + static void fastPathNoOp(operation_t *op, uint32_t flags) { IGNORE(op); @@ -662,6 +686,7 @@ static fast_path_t fastPaths[] = { /* Some special fast paths to extend the abilities of the others in corner cases */ { fastPathRightToLeft, CR_any, FAST_PATH_VECTOR_HALFTONE | ONLY_H_OVERLAP }, { fastPathBottomToTop, CR_any, FAST_PATH_VECTOR_HALFTONE | ONLY_V_OVERLAP }, + { fastPathEarlyHalftone, CR_any, FAST_PATH_SRC_0BPP | ONLY_DEST_32BPP | FAST_PATH_NO_COLOR_MAP | ONLY_SCALAR_HALFTONE }, { fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_32BPP | ONLY_DEST_32BPP }, { fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_16BPP | ONLY_DEST_16BPP }, { fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_8BPP | ONLY_DEST_8BPP }, From e4a27ec89afebd9a88dc3ab07992b7daab0940cc Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 14:08:15 +0000 Subject: [PATCH 16/17] Enable fast blit code for AArch64 --- build.linux64ARMv8/squeak.cog.spur/build.assert/mvm | 1 + build.linux64ARMv8/squeak.cog.spur/build.debug/mvm | 1 + build.linux64ARMv8/squeak.cog.spur/build/mvm | 1 + platforms/unix/config/configure | 12 ++++++++++++ platforms/unix/plugins/BitBltPlugin/acinclude.m4 | 11 +++++++++++ 5 files changed, 26 insertions(+) diff --git a/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm b/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm index 5d272d3adf..6be9484c68 100755 --- a/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm +++ b/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm @@ -21,6 +21,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . 
|| cp - test -f config.h || ../../../platforms/unix/config/configure \ --with-vmversion=5.0 --with-src=spur64src \ --without-vm-display-fbdev --without-npsqueak \ + --enable-fast-bitblt \ CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \ LIBS="-lrt" diff --git a/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm b/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm index 6227b8afe0..abe003b808 100755 --- a/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm +++ b/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm @@ -21,6 +21,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp - test -f config.h || ../../../platforms/unix/config/configure \ --with-vmversion=5.0 --with-src=spur64src \ --without-vm-display-fbdev --without-npsqueak \ + --enable-fast-bitblt \ CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \ LIBS="-lrt" diff --git a/build.linux64ARMv8/squeak.cog.spur/build/mvm b/build.linux64ARMv8/squeak.cog.spur/build/mvm index b2bcf24c4f..63f42b7253 100755 --- a/build.linux64ARMv8/squeak.cog.spur/build/mvm +++ b/build.linux64ARMv8/squeak.cog.spur/build/mvm @@ -22,6 +22,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp - test -f config.h || ../../../platforms/unix/config/configure \ --with-vmversion=5.0 --with-src=spur64src \ --without-npsqueak \ + --enable-fast-bitblt \ CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \ LIBS="-lrt" ## --without-vm-display-fbdev --without-npsqueak \ diff --git a/platforms/unix/config/configure b/platforms/unix/config/configure index 7b7c747b75..a2f424d486 100755 --- a/platforms/unix/config/configure +++ b/platforms/unix/config/configure @@ -18395,6 +18395,18 @@ if test "${enable_fast_bitblt+set}" = set; then : fi +;; + +aarch64) +# Check whether --enable-fast-bitblt was given. 
+if test "${enable_fast_bitblt+set}" = set; then : + enableval=$enable_fast_bitblt; if test "x$enableval" = "xyes" ; then + bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_flags="-DENABLE_FAST_BLT" + fi + +fi + ;; esac diff --git a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 index a999922c49..146b16acee 100644 --- a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 +++ b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 @@ -17,6 +17,17 @@ AC_ARG_ENABLE(fast-bitblt, ], []) ;; + +aarch64) +AC_ARG_ENABLE(fast-bitblt, + [ --enable-fast-bitblt enable fast BitBlt optimizations (default=no)], + [ if test "x$enableval" = "xyes" ; then + bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_flags="-DENABLE_FAST_BLT" + fi + ], + []) +;; esac AC_SUBST(BITBLT_OBJS, $bitblt_objs) From 10d8a11296926346152fa2556864201ee7a0e166 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 31 Mar 2021 12:04:45 +0100 Subject: [PATCH 17/17] AArch64 assembly optimisations --- .../Cross/plugins/BitBltPlugin/BitBltArm64.c | 2482 +++++++++++++++++ .../Cross/plugins/BitBltPlugin/BitBltArm64.h | 30 + .../plugins/BitBltPlugin/BitBltDispatch.c | 55 +- platforms/unix/config/configure | 2 +- .../unix/plugins/BitBltPlugin/acinclude.m4 | 2 +- 5 files changed, 2568 insertions(+), 3 deletions(-) create mode 100644 platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c create mode 100644 platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c new file mode 100644 index 0000000000..5ad9db04c4 --- /dev/null +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c @@ -0,0 +1,2482 @@ +/* + * Copyright © 2021 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + */ + +#include +#include + +#include "BitBltInternal.h" + +#define YES(x) x +#define NO(x) + +#define LOAD_SRC_512_INTERLEAVE_YES \ + "ld4 {v0.16b-v3.16b}, [%[src]], #512/8 \n\t" \ + +#define LOAD_SRC_512_INTERLEAVE_NO \ + "ld1 {v0.16b-v3.16b}, [%[src]], #512/8 \n\t" \ + +#define LOAD_DEST_512_INTERLEAVE_YES \ + "ld4 {v4.16b-v7.16b}, [%[dest]] \n\t" \ + "prfm pstl1strm, [%[dest]] \n\t" \ + +#define LOAD_DEST_512_INTERLEAVE_NO \ + "ld1 {v4.16b-v7.16b}, [%[dest]] \n\t" \ + "prfm pstl1strm, [%[dest]] \n\t" \ + +#define STORE_DEST_512_INTERLEAVE_YES \ + "st4 {v0.16b-v3.16b}, [%[dest]] \n\t" \ + +#define STORE_DEST_512_INTERLEAVE_NO \ + "st1 {v0.16b-v3.16b}, [%[dest]] \n\t" \ + +#define LOAD_SRC_256_INTERLEAVE_YES \ + "ld4 {v0.8b-v3.8b}, [%[src]], #256/8 \n\t" \ + +#define LOAD_SRC_256_INTERLEAVE_NO \ + "ld1 {v0.8b-v3.8b}, [%[src]], #256/8 \n\t" \ + +#define LOAD_DEST_256_INTERLEAVE_YES \ + "ld4 {v4.8b-v7.8b}, [%[dest]] \n\t" \ + +#define LOAD_DEST_256_INTERLEAVE_NO \ + "ld1 {v4.8b-v7.8b}, [%[dest]] \n\t" \ + +#define STORE_DEST_256_INTERLEAVE_YES \ + "st4 {v0.8b-v3.8b}, [%[dest]] \n\t" \ + +#define STORE_DEST_256_INTERLEAVE_NO \ + "st1 {v0.8b-v3.8b}, [%[dest]] \n\t" \ + +#define UNPACK_YES(sz,rw) \ + "uzp1 v16."#sz"b, v0."#sz"b, v1."#sz"b \n\t" \ + "uzp2 v17."#sz"b, v0."#sz"b, v1."#sz"b \n\t" \ + rw("uzp1 v20."#sz"b, v4."#sz"b, v5."#sz"b \n\t")\ + rw("uzp2 v21."#sz"b, v4."#sz"b, v5."#sz"b \n\t")\ + "uzp1 v18."#sz"b, v2."#sz"b, v3."#sz"b \n\t" \ + "uzp2 v19."#sz"b, v2."#sz"b, v3."#sz"b \n\t" \ + rw("uzp1 v22."#sz"b, v6."#sz"b, v7."#sz"b \n\t")\ + rw("uzp2 v23."#sz"b, v6."#sz"b, v7."#sz"b \n\t")\ + "uzp1 v0."#sz"b, v16."#sz"b, v18."#sz"b \n\t" \ + "uzp2 v2."#sz"b, v16."#sz"b, v18."#sz"b \n\t" \ + rw("uzp1 v4."#sz"b, v20."#sz"b, v22."#sz"b \n\t")\ + rw("uzp2 v6."#sz"b, v20."#sz"b, v22."#sz"b \n\t")\ + "uzp1 v1."#sz"b, v17."#sz"b, v19."#sz"b \n\t" \ + "uzp2 v3."#sz"b, v17."#sz"b, v19."#sz"b \n\t" \ + rw("uzp1 v5."#sz"b, v21."#sz"b, v23."#sz"b \n\t")\ + rw("uzp2 v7."#sz"b, v21."#sz"b, v23."#sz"b \n\t")\ + +#define UNPACK_NO(sz,rw) /* nothing */ + +#define UNPACK_DEST_YES(sz) \ + "uzp1 v0."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "uzp2 v1."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "uzp1 v2."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + "uzp2 v3."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + "uzp1 v4."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "uzp2 v6."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "uzp1 v5."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + "uzp2 v7."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + +#define UNPACK_DEST_NO(sz) /* nothing */ + +#define PACK_YES(sz) \ + "zip1 v4."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "zip2 v6."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "zip1 v5."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + "zip2 v7."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + "zip1 v0."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "zip2 v1."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "zip1 v2."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + "zip2 v3."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + +#define PACK_NO(sz) /* nothing */ + +#define DEFINE_FAST_PATH_0_32(op,interleave,rw) \ +static void fastPath##op##_0_32(operation_t *o, uint32_t flags) \ +{ \ + op##_0_32_DECLARE_TEMPORARIES; \ + IGNORE(flags); \ + COPY_OP_TO_LOCALS(o, uint32_t, uint32_t); \ + uint32_t *dest = destBits + destPitch * destY + destX; \ + destPitch -= width; \ + __asm__ volatile ( \ + op##_0_32_INIT \ + "" \ + : /* Outputs */ \ + [dest]"+r"(dest) /* just so we can handle comma next */ \ + op##_0_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + [halftone]"r"(halftoneBase) \ + ); \ + do { \ + uint32_t remain = width; \ + 
__asm__ volatile ( \ + "tst %[dest], #4 \n\t" \ + "b.eq 10f \n\t" \ + op##_0_32_32 \ + "sub %[x], %[x], #1 \n\t" \ + "10: \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.mi 30f \n\t" \ + "20: \n\t" \ + LOAD_DEST_512_INTERLEAVE_##interleave \ + op##_0_32_512 \ + STORE_DEST_512_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #512/8 \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.pl 20b \n\t" \ + "30: \n\t" \ + "add %[x], %[x], #512/32 \n\t" \ + "cmp %[x], #32/32 \n\t" \ + "b.lo 90f \n\t" \ + "cmp %[x], #256/32 \n\t" \ + "b.lo 70f \n\t" \ + "b.eq 60f \n\t" \ + "tbz %[x], #8-5, 8f \n\t" \ + "ld1 {v4.16b-v5.16b}, [%[dest]], #256/8 \n\t" \ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ldr q6, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d7, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s17, [%[dest]], #32/8 \n\t" \ + "mov v7.s[2], v17.s[0] \n\t" \ + "5: \n\t" \ + "sub %[dest], %[dest], %[x], lsl #2 \n\t" \ + UNPACK_DEST_##interleave(16) \ + op##_0_32_512 \ + PACK_##interleave(16) \ + "tbz %[x], #8-5, 8f \n\t" \ + "st1 {v0.16b-v1.16b}, [%[dest]], #256/8 \n\t" \ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "str q2, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d3, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "mov v3.s[0], v3.s[2] \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "b 90f \n\t" \ + "60: \n\t" \ + LOAD_DEST_256_INTERLEAVE_##interleave \ + op##_0_32_256 \ + STORE_DEST_256_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #256/8 \n\t" \ + "b 90f \n\t" \ + "70: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ld1 {v4.8b-v5.8b}, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d6, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s7, [%[dest]], #32/8 \n\t" \ + "5: \n\t" \ + "sub %[dest], %[dest], %[x], lsl #2 \n\t" \ + UNPACK_DEST_##interleave(8) \ + op##_0_32_256 \ + PACK_##interleave(8) \ + "tbz %[x], #7-5, 7f \n\t" \ + "st1 {v0.8b-v1.8b}, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d2, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "90: \n\t" \ + : /* Outputs */ \ + [dest]"+r"(dest), \ + [x]"+r"(remain) \ + op##_0_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + : /* Clobbers */ \ + "memory" \ + ); \ + dest += destPitch; \ + } while (--height > 0); \ +} \ + +#define DEFINE_FAST_PATH_32_32(op,interleave,rw) \ +static void fastPath##op##_32_32(operation_t *o, uint32_t flags) \ +{ \ + op##_32_32_DECLARE_STACK; \ + op##_32_32_DECLARE_TEMPORARIES; \ + IGNORE(flags); \ + COPY_OP_TO_LOCALS(o, uint32_t, uint32_t); \ + uint32_t *src = srcBits + srcPitch * srcY + srcX; \ + uint32_t *dest = destBits + destPitch * destY + destX; \ + uint32_t *clut = *cmLookupTable; \ + srcPitch -= width; \ + destPitch -= width; \ + __asm__ volatile ( \ + op##_32_32_INIT \ + "" \ + : /* Outputs */ \ + [clut]"+r"(clut) \ + op##_32_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + [halftone]"r"(halftoneBase) \ + op##_32_32_PASS_STACK \ + : /* Clobbers */ \ + "memory" \ + ); \ + do { \ + uint32_t remain = width; \ + __asm__ volatile ( \ + "tst %[dest], #4 \n\t" \ + "b.eq 10f \n\t" \ + op##_32_32_32 \ + "sub %[x], %[x], #1 \n\t" \ + "10: \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.mi 30f 
\n\t" \ + "20: \n\t" \ + LOAD_SRC_512_INTERLEAVE_##interleave \ + rw(LOAD_DEST_512_INTERLEAVE_##interleave) \ + op##_32_32_512 \ + STORE_DEST_512_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #512/8 \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.pl 20b \n\t" \ + "30: \n\t" \ + "add %[x], %[x], #512/32 \n\t" \ + "cmp %[x], #32/32 \n\t" \ + "b.lo 90f \n\t" \ + "b.eq 80f \n\t" \ + "cmp %[x], #256/32 \n\t" \ + "b.lo 70f \n\t" \ + "b.eq 60f \n\t" \ + "tbz %[x], #8-5, 8f \n\t" \ + "ld1 {v0.16b-v1.16b}, [%[src]], #256/8 \n\t" \ + rw("ld1 {v4.16b-v5.16b}, [%[dest]], #256/8 \n\t")\ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ldr q2, [%[src]], #128/8 \n\t" \ + rw("ldr q6, [%[dest]], #128/8 \n\t")\ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d3, [%[src]], #64/8 \n\t" \ + rw("ldr d7, [%[dest]], #64/8 \n\t")\ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s16, [%[src]], #32/8 \n\t" \ + rw("ldr s17, [%[dest]], #32/8 \n\t")\ + "mov v3.s[2], v16.s[0] \n\t" \ + rw("mov v7.s[2], v17.s[0] \n\t")\ + "5: \n\t" \ + rw("sub %[dest], %[dest], %[x], lsl #2 \n\t")\ + UNPACK_##interleave(16,rw) \ + op##_32_32_512 \ + PACK_##interleave(16) \ + "tbz %[x], #8-5, 8f \n\t" \ + "st1 {v0.16b-v1.16b}, [%[dest]], #256/8 \n\t" \ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "str q2, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d3, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "mov v3.s[0], v3.s[2] \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "b 90f \n\t" \ + "60: \n\t" \ + LOAD_SRC_256_INTERLEAVE_##interleave \ + rw(LOAD_DEST_256_INTERLEAVE_##interleave) \ + op##_32_32_256 \ + STORE_DEST_256_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #256/8 \n\t" \ + "b 90f \n\t" \ + "70: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ld1 {v0.8b-v1.8b}, [%[src]], #128/8 \n\t" \ + rw("ld1 {v4.8b-v5.8b}, [%[dest]], #128/8 \n\t")\ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d2, [%[src]], #64/8 \n\t" \ + rw("ldr d6, [%[dest]], #64/8 \n\t")\ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s3, [%[src]], #32/8 \n\t" \ + rw("ldr s7, [%[dest]], #32/8 \n\t")\ + "5: \n\t" \ + rw("sub %[dest], %[dest], %[x], lsl #2 \n\t")\ + UNPACK_##interleave(8,rw) \ + op##_32_32_256 \ + PACK_##interleave(8) \ + "tbz %[x], #7-5, 7f \n\t" \ + "st1 {v0.8b-v1.8b}, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d2, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "b 90f \n\t" \ + "80: \n\t" \ + op##_32_32_32 \ + "90: \n\t" \ + : /* Outputs */ \ + [src]"+r"(src), \ + [dest]"+r"(dest), \ + [x]"+r"(remain) \ + op##_32_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + : /* Clobbers */ \ + "memory" \ + ); \ + src += srcPitch; \ + dest += destPitch; \ + } while (--height > 0); \ + __asm__ volatile ( \ + op##_32_32_FINAL \ + "" \ + : /* Outputs */ \ + : /* Inputs */ \ + [unused]"I"(0) \ + op##_32_32_PASS_STACK \ + ); \ +} \ + +#define DEFINE_FAST_PATH(op, src_dst, interleave, rw) DEFINE_FAST_PATH_##src_dst(op, interleave, rw) + +/******************************************************************************/ + +#define BitAnd_32_32_DECLARE_STACK + +#define BitAnd_32_32_PASS_STACK + +#define BitAnd_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitAnd_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + 
+#define BitAnd_32_32_INIT + +#define BitAnd_32_32_FINAL + +#define BitAnd_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "and %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitAnd_32_32_256 \ + "and v0.8b, v0.8b, v4.8b \n\t" \ + "and v1.8b, v1.8b, v5.8b \n\t" \ + "and v2.8b, v2.8b, v6.8b \n\t" \ + "and v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitAnd_32_32_512 \ + "and v0.16b, v0.16b, v4.16b \n\t" \ + "and v1.16b, v1.16b, v5.16b \n\t" \ + "and v2.16b, v2.16b, v6.16b \n\t" \ + "and v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitAnd, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitAndInvert_32_32_DECLARE_STACK + +#define BitAndInvert_32_32_PASS_STACK + +#define BitAndInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitAndInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitAndInvert_32_32_INIT + +#define BitAndInvert_32_32_FINAL + +#define BitAndInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "bic %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitAndInvert_32_32_256 \ + "bic v0.8b, v0.8b, v4.8b \n\t" \ + "bic v1.8b, v1.8b, v5.8b \n\t" \ + "bic v2.8b, v2.8b, v6.8b \n\t" \ + "bic v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitAndInvert_32_32_512 \ + "bic v0.16b, v0.16b, v4.16b \n\t" \ + "bic v1.16b, v1.16b, v5.16b \n\t" \ + "bic v2.16b, v2.16b, v6.16b \n\t" \ + "bic v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitAndInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertAnd_32_32_DECLARE_STACK + +#define BitInvertAnd_32_32_PASS_STACK + +#define BitInvertAnd_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertAnd_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertAnd_32_32_INIT + +#define BitInvertAnd_32_32_FINAL + +#define BitInvertAnd_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "bic %w[tmp0], %w[tmp1], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertAnd_32_32_256 \ + "bic v0.8b, v4.8b, v0.8b \n\t" \ + "bic v1.8b, v5.8b, v1.8b \n\t" \ + "bic v2.8b, v6.8b, v2.8b \n\t" \ + "bic v3.8b, v7.8b, v3.8b \n\t" \ + +#define BitInvertAnd_32_32_512 \ + "bic v0.16b, v4.16b, v0.16b \n\t" \ + "bic v1.16b, v5.16b, v1.16b \n\t" \ + "bic v2.16b, v6.16b, v2.16b \n\t" \ + "bic v3.16b, v7.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertAnd, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitXor_32_32_DECLARE_STACK + +#define BitXor_32_32_PASS_STACK + +#define BitXor_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitXor_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitXor_32_32_INIT + +#define BitXor_32_32_FINAL + +#define BitXor_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "eor %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitXor_32_32_256 \ + "eor v0.8b, v0.8b, v4.8b \n\t" \ + "eor v1.8b, v1.8b, v5.8b \n\t" \ + "eor v2.8b, v2.8b, v6.8b \n\t" \ + "eor v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitXor_32_32_512 \ + "eor v0.16b, v0.16b, v4.16b \n\t" \ + "eor v1.16b, v1.16b, v5.16b \n\t" \ + "eor v2.16b, v2.16b, v6.16b \n\t" \ + "eor v3.16b, v3.16b, v7.16b \n\t" \ + 
+DEFINE_FAST_PATH(BitXor, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitOr_32_32_DECLARE_STACK + +#define BitOr_32_32_PASS_STACK + +#define BitOr_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitOr_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitOr_32_32_INIT + +#define BitOr_32_32_FINAL + +#define BitOr_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orr %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitOr_32_32_256 \ + "orr v0.8b, v0.8b, v4.8b \n\t" \ + "orr v1.8b, v1.8b, v5.8b \n\t" \ + "orr v2.8b, v2.8b, v6.8b \n\t" \ + "orr v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitOr_32_32_512 \ + "orr v0.16b, v0.16b, v4.16b \n\t" \ + "orr v1.16b, v1.16b, v5.16b \n\t" \ + "orr v2.16b, v2.16b, v6.16b \n\t" \ + "orr v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitOr, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertAndInvert_32_32_DECLARE_STACK + +#define BitInvertAndInvert_32_32_PASS_STACK + +#define BitInvertAndInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertAndInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertAndInvert_32_32_INIT + +#define BitInvertAndInvert_32_32_FINAL + +#define BitInvertAndInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orr %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "mvn %w[tmp0], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertAndInvert_32_32_256 \ + "orr v0.8b, v0.8b, v4.8b \n\t" \ + "orr v1.8b, v1.8b, v5.8b \n\t" \ + "orr v2.8b, v2.8b, v6.8b \n\t" \ + "orr v3.8b, v3.8b, v7.8b \n\t" \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + +#define BitInvertAndInvert_32_32_512 \ + "orr v0.16b, v0.16b, v4.16b \n\t" \ + "orr v1.16b, v1.16b, v5.16b \n\t" \ + "orr v2.16b, v2.16b, v6.16b \n\t" \ + "orr v3.16b, v3.16b, v7.16b \n\t" \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertAndInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertXor_32_32_DECLARE_STACK + +#define BitInvertXor_32_32_PASS_STACK + +#define BitInvertXor_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertXor_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertXor_32_32_INIT + +#define BitInvertXor_32_32_FINAL + +#define BitInvertXor_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "mvn %w[tmp0], %w[tmp0] \n\t" \ + "eor %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertXor_32_32_256 \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + "eor v0.8b, v0.8b, v4.8b \n\t" \ + "eor v1.8b, v1.8b, v5.8b \n\t" \ + "eor v2.8b, v2.8b, v6.8b \n\t" \ + "eor v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitInvertXor_32_32_512 \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + "eor v0.16b, v0.16b, v4.16b \n\t" \ + "eor v1.16b, v1.16b, v5.16b \n\t" \ + "eor v2.16b, v2.16b, v6.16b \n\t" \ + "eor v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertXor, 
32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertDestination_0_32_DECLARE_STACK + +#define BitInvertDestination_0_32_PASS_STACK + +#define BitInvertDestination_0_32_DECLARE_TEMPORARIES uint64_t tmp + +#define BitInvertDestination_0_32_PASS_TEMPORARIES , [tmp]"=&r"(tmp) + +#define BitInvertDestination_0_32_INIT + +#define BitInvertDestination_0_32_FINAL + +#define BitInvertDestination_0_32_32 \ + "ldr %w[tmp], [%[dest]] \n\t" \ + "mvn %w[tmp], %w[tmp] \n\t" \ + "str %w[tmp], [%[dest]], #4 \n\t" \ + +#define BitInvertDestination_0_32_256 \ + "mvn v0.8b, v4.8b \n\t" \ + "mvn v1.8b, v5.8b \n\t" \ + "mvn v2.8b, v6.8b \n\t" \ + "mvn v3.8b, v7.8b \n\t" \ + +#define BitInvertDestination_0_32_512 \ + "mvn v0.16b, v4.16b \n\t" \ + "mvn v1.16b, v5.16b \n\t" \ + "mvn v2.16b, v6.16b \n\t" \ + "mvn v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertDestination, 0_32, NO, YES) + +/******************************************************************************/ + +#define BitOrInvert_32_32_DECLARE_STACK + +#define BitOrInvert_32_32_PASS_STACK + +#define BitOrInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitOrInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitOrInvert_32_32_INIT + +#define BitOrInvert_32_32_FINAL + +#define BitOrInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orn %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitOrInvert_32_32_256 \ + "orn v0.8b, v0.8b, v4.8b \n\t" \ + "orn v1.8b, v1.8b, v5.8b \n\t" \ + "orn v2.8b, v2.8b, v6.8b \n\t" \ + "orn v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitOrInvert_32_32_512 \ + "orn v0.16b, v0.16b, v4.16b \n\t" \ + "orn v1.16b, v1.16b, v5.16b \n\t" \ + "orn v2.16b, v2.16b, v6.16b \n\t" \ + "orn v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitOrInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertSource_32_32_DECLARE_STACK + +#define BitInvertSource_32_32_PASS_STACK + +#define BitInvertSource_32_32_DECLARE_TEMPORARIES uint64_t tmp + +#define BitInvertSource_32_32_PASS_TEMPORARIES , [tmp]"=&r"(tmp) + +#define BitInvertSource_32_32_INIT + +#define BitInvertSource_32_32_FINAL + +#define BitInvertSource_32_32_32 \ + "ldr %w[tmp], [%[src]], #4 \n\t" \ + "mvn %w[tmp], %w[tmp] \n\t" \ + "str %w[tmp], [%[dest]], #4 \n\t" \ + +#define BitInvertSource_32_32_256 \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + +#define BitInvertSource_32_32_512 \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertSource, 32_32, NO, NO) + +/******************************************************************************/ + +#define BitInvertOr_32_32_DECLARE_STACK + +#define BitInvertOr_32_32_PASS_STACK + +#define BitInvertOr_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertOr_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertOr_32_32_INIT + +#define BitInvertOr_32_32_FINAL + +#define BitInvertOr_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orn %w[tmp0], %w[tmp1], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertOr_32_32_256 \ + "orn v0.8b, v4.8b, v0.8b \n\t" \ + "orn v1.8b, v5.8b, v1.8b \n\t" \ + "orn v2.8b, v6.8b, v2.8b \n\t" \ + 
"orn v3.8b, v7.8b, v3.8b \n\t" \ + +#define BitInvertOr_32_32_512 \ + "orn v0.16b, v4.16b, v0.16b \n\t" \ + "orn v1.16b, v5.16b, v1.16b \n\t" \ + "orn v2.16b, v6.16b, v2.16b \n\t" \ + "orn v3.16b, v7.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertOr, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertOrInvert_32_32_DECLARE_STACK + +#define BitInvertOrInvert_32_32_PASS_STACK + +#define BitInvertOrInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertOrInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertOrInvert_32_32_INIT + +#define BitInvertOrInvert_32_32_FINAL + +#define BitInvertOrInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "and %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "mvn %w[tmp0], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertOrInvert_32_32_256 \ + "and v0.8b, v0.8b, v4.8b \n\t" \ + "and v1.8b, v1.8b, v5.8b \n\t" \ + "and v2.8b, v2.8b, v6.8b \n\t" \ + "and v3.8b, v3.8b, v7.8b \n\t" \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + +#define BitInvertOrInvert_32_32_512 \ + "and v0.16b, v0.16b, v4.16b \n\t" \ + "and v1.16b, v1.16b, v5.16b \n\t" \ + "and v2.16b, v2.16b, v6.16b \n\t" \ + "and v3.16b, v3.16b, v7.16b \n\t" \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertOrInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define AddWord_32_32_DECLARE_STACK + +#define AddWord_32_32_PASS_STACK + +#define AddWord_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define AddWord_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define AddWord_32_32_INIT + +#define AddWord_32_32_FINAL + +#define AddWord_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "add %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define AddWord_32_32_256 \ + "add v0.2s, v0.2s, v4.2s \n\t" \ + "add v1.2s, v1.2s, v5.2s \n\t" \ + "add v2.2s, v2.2s, v6.2s \n\t" \ + "add v3.2s, v3.2s, v7.2s \n\t" \ + +#define AddWord_32_32_512 \ + "add v0.4s, v0.4s, v4.4s \n\t" \ + "add v1.4s, v1.4s, v5.4s \n\t" \ + "add v2.4s, v2.4s, v6.4s \n\t" \ + "add v3.4s, v3.4s, v7.4s \n\t" \ + +DEFINE_FAST_PATH(AddWord, 32_32, NO, YES) + +/******************************************************************************/ + +#define SubWord_32_32_DECLARE_STACK + +#define SubWord_32_32_PASS_STACK + +#define SubWord_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define SubWord_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define SubWord_32_32_INIT + +#define SubWord_32_32_FINAL + +#define SubWord_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "sub %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define SubWord_32_32_256 \ + "sub v0.2s, v0.2s, v4.2s \n\t" \ + "sub v1.2s, v1.2s, v5.2s \n\t" \ + "sub v2.2s, v2.2s, v6.2s \n\t" \ + "sub v3.2s, v3.2s, v7.2s \n\t" \ + +#define SubWord_32_32_512 \ + "sub v0.4s, v0.4s, v4.4s \n\t" \ + "sub v1.4s, v1.4s, v5.4s \n\t" \ + "sub v2.4s, v2.4s, v6.4s \n\t" \ + "sub v3.4s, v3.4s, v7.4s \n\t" \ + +DEFINE_FAST_PATH(SubWord, 32_32, NO, YES) + +/******************************************************************************/ + 
+#define RgbAdd_32_32_DECLARE_STACK + +#define RgbAdd_32_32_PASS_STACK + +#define RgbAdd_32_32_DECLARE_TEMPORARIES + +#define RgbAdd_32_32_PASS_TEMPORARIES + +#define RgbAdd_32_32_INIT + +#define RgbAdd_32_32_FINAL + +#define RgbAdd_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "uqadd v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbAdd_32_32_256 \ + "uqadd v0.8b, v0.8b, v4.8b \n\t" \ + "uqadd v1.8b, v1.8b, v5.8b \n\t" \ + "uqadd v2.8b, v2.8b, v6.8b \n\t" \ + "uqadd v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbAdd_32_32_512 \ + "uqadd v0.16b, v0.16b, v4.16b \n\t" \ + "uqadd v1.16b, v1.16b, v5.16b \n\t" \ + "uqadd v2.16b, v2.16b, v6.16b \n\t" \ + "uqadd v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbAdd, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbSub_32_32_DECLARE_STACK + +#define RgbSub_32_32_PASS_STACK + +#define RgbSub_32_32_DECLARE_TEMPORARIES + +#define RgbSub_32_32_PASS_TEMPORARIES + +#define RgbSub_32_32_INIT + +#define RgbSub_32_32_FINAL + +#define RgbSub_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "uabd v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbSub_32_32_256 \ + "uabd v0.8b, v0.8b, v4.8b \n\t" \ + "uabd v1.8b, v1.8b, v5.8b \n\t" \ + "uabd v2.8b, v2.8b, v6.8b \n\t" \ + "uabd v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbSub_32_32_512 \ + "uabd v0.16b, v0.16b, v4.16b \n\t" \ + "uabd v1.16b, v1.16b, v5.16b \n\t" \ + "uabd v2.16b, v2.16b, v6.16b \n\t" \ + "uabd v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbSub, 32_32, NO, YES) + +/******************************************************************************/ + +#define AlphaBlend_32_32_DECLARE_STACK + +#define AlphaBlend_32_32_PASS_STACK + +#define AlphaBlend_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define AlphaBlend_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define AlphaBlend_32_32_INIT \ + "movi v31.16b, #0xFF \n\t" \ + "ldr s30, =0xFF000000 \n\t" \ + +#define AlphaBlend_32_32_FINAL + +#define AlphaBlend_32_32_32 \ + "ldrb %w[tmp0], [%[src], #3] \n\t" \ + "ldr s0, [%[src]], #4 \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmp %[tmp0], #0xFF \n\t" \ + "b.eq 1f \n\t" \ + "ldr s1, [%[dest]] \n\t" \ + "orr v2.8b, v30.8b, v0.8b \n\t" \ + "mvn v3.8b, v0.8b \n\t" \ + "dup v0.8b, v0.b[3] \n\t" \ + "dup v3.8b, v3.b[3] \n\t" \ + "umull v0.8h, v2.8b, v0.8b \n\t" \ + "umlal v0.8h, v1.8b, v3.8b \n\t" \ + "cmtst v1.4h, v0.4h, v31.4h \n\t" \ + "usra v0.4h, v0.4h, #8 \n\t" \ + "sub v0.8b, v0.8b, v1.8b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "1: \n\t" \ + "str s0, [%[dest]] \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], #4 \n\t" \ + +#define AlphaBlend_32_32_256 \ + "mov %[tmp0], v3.d[0] \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmp %[tmp0], #-1 \n\t" \ + "b.eq 1f \n\t" \ + "mvn v16.8b, v3.8b \n\t" \ + "umull v0.8h, v0.8b, v3.8b \n\t" \ + "umlal v0.8h, v4.8b, v16.8b \n\t" \ + "umull v1.8h, v1.8b, v3.8b \n\t" \ + "umlal v1.8h, v5.8b, v16.8b \n\t" \ + "umull v2.8h, v2.8b, v3.8b \n\t" \ + "umlal v2.8h, v6.8b, v16.8b \n\t" \ + "umull v3.8h, v31.8b, v3.8b \n\t" \ + "umlal v3.8h, v7.8b, v16.8b \n\t" \ + "cmtst v4.8h, v0.8h, v31.8h \n\t" \ + "cmtst v5.8h, v1.8h, v31.8h \n\t" \ + "cmtst v6.8h, v2.8h, v31.8h \n\t" \ + "cmtst v7.8h, v3.8h, v31.8h \n\t" \ + "usra v0.8h, v0.8h, #8 \n\t" \ + "usra v1.8h, v1.8h, #8 \n\t" \ + "usra v2.8h, v2.8h, #8 \n\t" \ + "usra v3.8h, v3.8h, #8 \n\t" \ + "sub v0.16b, v0.16b, v4.16b \n\t" \ + "sub v1.16b, v1.16b, 
v5.16b \n\t" \ + "sub v2.16b, v2.16b, v6.16b \n\t" \ + "sub v3.16b, v3.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + "1: \n\t" \ + +#define AlphaBlend_32_32_512 \ + "mov %[tmp0], v3.d[0] \n\t" \ + "mov %[tmp1], v3.d[1] \n\t" \ + "cmp %[tmp0], #0 \n\t" \ + "ccmp %[tmp1], #0, #0, eq \n\t" \ + "b.eq 2f \n\t" \ + "cmn %[tmp0], #1 \n\t" \ + "ccmn %[tmp1], #1, #0, eq \n\t" \ + "b.eq 1f \n\t" \ + "mvn v16.16b, v3.16b \n\t" \ + "umull v20.8h, v0.8b, v3.8b \n\t" \ + "umlal v20.8h, v4.8b, v16.8b \n\t" \ + "umull2 v21.8h, v0.16b, v3.16b \n\t" \ + "umlal2 v21.8h, v4.16b, v16.16b \n\t" \ + "umull v22.8h, v1.8b, v3.8b \n\t" \ + "umlal v22.8h, v5.8b, v16.8b \n\t" \ + "umull2 v23.8h, v1.16b, v3.16b \n\t" \ + "umlal2 v23.8h, v5.16b, v16.16b \n\t" \ + "umull v24.8h, v2.8b, v3.8b \n\t" \ + "umlal v24.8h, v6.8b, v16.8b \n\t" \ + "umull2 v25.8h, v2.16b, v3.16b \n\t" \ + "umlal2 v25.8h, v6.16b, v16.16b \n\t" \ + "umull v26.8h, v31.8b, v3.8b \n\t" \ + "umlal v26.8h, v7.8b, v16.8b \n\t" \ + "umull2 v27.8h, v31.16b, v3.16b \n\t" \ + "umlal2 v27.8h, v7.16b, v16.16b \n\t" \ + "cmtst v0.8h, v20.8h, v31.8h \n\t" \ + "cmtst v1.8h, v21.8h, v31.8h \n\t" \ + "cmtst v2.8h, v22.8h, v31.8h \n\t" \ + "cmtst v3.8h, v23.8h, v31.8h \n\t" \ + "cmtst v4.8h, v24.8h, v31.8h \n\t" \ + "cmtst v5.8h, v25.8h, v31.8h \n\t" \ + "cmtst v6.8h, v26.8h, v31.8h \n\t" \ + "cmtst v7.8h, v27.8h, v31.8h \n\t" \ + "usra v20.8h, v20.8h, #8 \n\t" \ + "usra v21.8h, v21.8h, #8 \n\t" \ + "usra v22.8h, v22.8h, #8 \n\t" \ + "usra v23.8h, v23.8h, #8 \n\t" \ + "usra v24.8h, v24.8h, #8 \n\t" \ + "usra v25.8h, v25.8h, #8 \n\t" \ + "usra v26.8h, v26.8h, #8 \n\t" \ + "usra v27.8h, v27.8h, #8 \n\t" \ + "sub v0.16b, v20.16b, v0.16b \n\t" \ + "sub v1.16b, v21.16b, v1.16b \n\t" \ + "sub v2.16b, v22.16b, v2.16b \n\t" \ + "sub v3.16b, v23.16b, v3.16b \n\t" \ + "sub v4.16b, v24.16b, v4.16b \n\t" \ + "sub v5.16b, v25.16b, v5.16b \n\t" \ + "sub v6.16b, v26.16b, v6.16b \n\t" \ + "sub v7.16b, v27.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v20.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v21.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v22.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v23.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v20.d[0] \n\t" \ + "mov v1.d[1], v21.d[0] \n\t" \ + "mov v2.d[1], v22.d[0] \n\t" \ + "mov v3.d[1], v23.d[0] \n\t" \ + "1: \n\t" \ + +DEFINE_FAST_PATH(AlphaBlend, 32_32, YES, YES) + +/******************************************************************************/ + +#define PixPaint_32_32_DECLARE_STACK + +#define PixPaint_32_32_PASS_STACK + +#define PixPaint_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define PixPaint_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define PixPaint_32_32_INIT + +#define PixPaint_32_32_FINAL + +#define PixPaint_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "str %w[tmp0], [%[dest]] \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], #4 \n\t" \ + +#define PixPaint_32_32_256 \ + "orr v16.8b, v0.8b, v1.8b \n\t" \ + "orr v17.8b, v2.8b, v3.8b \n\t" \ + "orr v16.8b, v16.8b, v17.8b \n\t" \ + "mov %[tmp0], v16.d[0] \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmeq v16.2s, v0.2s, #0 \n\t" \ + "cmeq v17.2s, v1.2s, #0 \n\t" \ + "cmeq v18.2s, v2.2s, #0 \n\t" \ + "cmeq v19.2s, v3.2s, #0 \n\t" \ + "bit v0.8b, v4.8b, v16.8b \n\t" \ + "bit v1.8b, v5.8b, v17.8b \n\t" \ + "bit v2.8b, v6.8b, v18.8b \n\t" \ + "bit v3.8b, 
v7.8b, v19.8b \n\t" \ + +#define PixPaint_32_32_512 \ + "orr v16.16b, v0.16b, v1.16b \n\t" \ + "orr v17.16b, v2.16b, v3.16b \n\t" \ + "orr v16.16b, v16.16b, v17.16b \n\t" \ + "mov %[tmp0], v16.d[0] \n\t" \ + "mov %[tmp1], v16.d[1] \n\t" \ + "orr %[tmp0], %[tmp0], %[tmp1] \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmeq v16.4s, v0.4s, #0 \n\t" \ + "cmeq v17.4s, v1.4s, #0 \n\t" \ + "cmeq v18.4s, v2.4s, #0 \n\t" \ + "cmeq v19.4s, v3.4s, #0 \n\t" \ + "bit v0.16b, v4.16b, v16.16b \n\t" \ + "bit v1.16b, v5.16b, v17.16b \n\t" \ + "bit v2.16b, v6.16b, v18.16b \n\t" \ + "bit v3.16b, v7.16b, v19.16b \n\t" \ + +DEFINE_FAST_PATH(PixPaint, 32_32, NO, YES) + +/******************************************************************************/ + +#define PixMask_32_32_DECLARE_STACK + +#define PixMask_32_32_PASS_STACK + +#define PixMask_32_32_DECLARE_TEMPORARIES uint64_t tmp + +#define PixMask_32_32_PASS_TEMPORARIES , [tmp]"=&r"(tmp) + +#define PixMask_32_32_INIT + +#define PixMask_32_32_FINAL + +#define PixMask_32_32_32 \ + "ldr %w[tmp], [%[src]], #4 \n\t" \ + "cbz %[tmp], 2f \n\t" \ + "str wzr, [%[dest]] \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], #4 \n\t" \ + +#define PixMask_32_32_256 \ + "cmeq v16.2s, v0.2s, #0 \n\t" \ + "cmeq v17.2s, v1.2s, #0 \n\t" \ + "cmeq v18.2s, v2.2s, #0 \n\t" \ + "cmeq v19.2s, v3.2s, #0 \n\t" \ + "movi v0.8b, #0 \n\t" \ + "movi v1.8b, #0 \n\t" \ + "movi v2.8b, #0 \n\t" \ + "movi v3.8b, #0 \n\t" \ + "bit v0.8b, v4.8b, v16.8b \n\t" \ + "bit v1.8b, v5.8b, v17.8b \n\t" \ + "bit v2.8b, v6.8b, v18.8b \n\t" \ + "bit v3.8b, v7.8b, v19.8b \n\t" \ + +#define PixMask_32_32_512 \ + "cmeq v16.4s, v0.4s, #0 \n\t" \ + "cmeq v17.4s, v1.4s, #0 \n\t" \ + "cmeq v18.4s, v2.4s, #0 \n\t" \ + "cmeq v19.4s, v3.4s, #0 \n\t" \ + "movi v0.16b, #0 \n\t" \ + "movi v1.16b, #0 \n\t" \ + "movi v2.16b, #0 \n\t" \ + "movi v3.16b, #0 \n\t" \ + "bit v0.16b, v4.16b, v16.16b \n\t" \ + "bit v1.16b, v5.16b, v17.16b \n\t" \ + "bit v2.16b, v6.16b, v18.16b \n\t" \ + "bit v3.16b, v7.16b, v19.16b \n\t" \ + +DEFINE_FAST_PATH(PixMask, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbMax_32_32_DECLARE_STACK + +#define RgbMax_32_32_PASS_STACK + +#define RgbMax_32_32_DECLARE_TEMPORARIES + +#define RgbMax_32_32_PASS_TEMPORARIES + +#define RgbMax_32_32_INIT + +#define RgbMax_32_32_FINAL + +#define RgbMax_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "umax v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbMax_32_32_256 \ + "umax v0.8b, v0.8b, v4.8b \n\t" \ + "umax v1.8b, v1.8b, v5.8b \n\t" \ + "umax v2.8b, v2.8b, v6.8b \n\t" \ + "umax v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbMax_32_32_512 \ + "umax v0.16b, v0.16b, v4.16b \n\t" \ + "umax v1.16b, v1.16b, v5.16b \n\t" \ + "umax v2.16b, v2.16b, v6.16b \n\t" \ + "umax v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbMax, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbMin_32_32_DECLARE_STACK + +#define RgbMin_32_32_PASS_STACK + +#define RgbMin_32_32_DECLARE_TEMPORARIES + +#define RgbMin_32_32_PASS_TEMPORARIES + +#define RgbMin_32_32_INIT + +#define RgbMin_32_32_FINAL + +#define RgbMin_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbMin_32_32_256 \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "umin v1.8b, v1.8b, v5.8b \n\t" \ + "umin v2.8b, v2.8b, v6.8b \n\t" \ + "umin v3.8b, v3.8b, v7.8b \n\t" \ + 
+#define RgbMin_32_32_512 \ + "umin v0.16b, v0.16b, v4.16b \n\t" \ + "umin v1.16b, v1.16b, v5.16b \n\t" \ + "umin v2.16b, v2.16b, v6.16b \n\t" \ + "umin v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbMin, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbMinInvert_32_32_DECLARE_STACK + +#define RgbMinInvert_32_32_PASS_STACK + +#define RgbMinInvert_32_32_DECLARE_TEMPORARIES + +#define RgbMinInvert_32_32_PASS_TEMPORARIES + +#define RgbMinInvert_32_32_INIT + +#define RgbMinInvert_32_32_FINAL + +#define RgbMinInvert_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "mvn v0.8b, v0.8b \n\t" \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbMinInvert_32_32_256 \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "umin v1.8b, v1.8b, v5.8b \n\t" \ + "umin v2.8b, v2.8b, v6.8b \n\t" \ + "umin v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbMinInvert_32_32_512 \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + "umin v0.16b, v0.16b, v4.16b \n\t" \ + "umin v1.16b, v1.16b, v5.16b \n\t" \ + "umin v2.16b, v2.16b, v6.16b \n\t" \ + "umin v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbMinInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define AlphaBlendConst_32_32_DECLARE_STACK + +#define AlphaBlendConst_32_32_PASS_STACK + +#define AlphaBlendConst_32_32_DECLARE_TEMPORARIES uint32_t alpha = o->opt.sourceAlpha, unAlpha = alpha ^ 0xFF + +#define AlphaBlendConst_32_32_PASS_TEMPORARIES , [alpha]"+r"(alpha), [unAlpha]"+r"(unAlpha) + +#define AlphaBlendConst_32_32_INIT \ + "dup v28.16b, %w[alpha] \n\t" \ + "dup v29.16b, %w[unAlpha] \n\t" \ + "movi v31.16b, #0xFF \n\t" \ + +#define AlphaBlendConst_32_32_FINAL + +#define AlphaBlendConst_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "umull v16.8h, v0.8b, v28.8b \n\t" \ + "umlal v16.8h, v4.8b, v29.8b \n\t" \ + "cmtst v0.8h, v16.8h, v31.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define AlphaBlendConst_32_32_256 \ + "umull v16.8h, v0.8b, v28.8b \n\t" \ + "umlal v16.8h, v4.8b, v29.8b \n\t" \ + "umull v17.8h, v1.8b, v28.8b \n\t" \ + "umlal v17.8h, v5.8b, v29.8b \n\t" \ + "umull v18.8h, v2.8b, v28.8b \n\t" \ + "umlal v18.8h, v6.8b, v29.8b \n\t" \ + "umull v19.8h, v3.8b, v28.8b \n\t" \ + "umlal v19.8h, v7.8b, v29.8b \n\t" \ + "cmtst v0.8h, v16.8h, v31.8h \n\t" \ + "cmtst v1.8h, v17.8h, v31.8h \n\t" \ + "cmtst v2.8h, v18.8h, v31.8h \n\t" \ + "cmtst v3.8h, v19.8h, v31.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + +#define AlphaBlendConst_32_32_512 \ + "umull v16.8h, v0.8b, v28.8b \n\t" \ + "umlal v16.8h, v4.8b, v29.8b \n\t" \ + "umull2 v17.8h, v0.16b, v28.16b \n\t" \ + "umlal2 v17.8h, v4.16b, v29.16b \n\t" \ + "umull v18.8h, v1.8b, v28.8b \n\t" \ + "umlal v18.8h, v5.8b, v29.8b \n\t" \ + "umull2 
v19.8h, v1.16b, v28.16b \n\t" \ + "umlal2 v19.8h, v5.16b, v29.16b \n\t" \ + "umull v20.8h, v2.8b, v28.8b \n\t" \ + "umlal v20.8h, v6.8b, v29.8b \n\t" \ + "umull2 v21.8h, v2.16b, v28.16b \n\t" \ + "umlal2 v21.8h, v6.16b, v29.16b \n\t" \ + "umull v22.8h, v3.8b, v28.8b \n\t" \ + "umlal v22.8h, v7.8b, v29.8b \n\t" \ + "umull2 v23.8h, v3.16b, v28.16b \n\t" \ + "umlal2 v23.8h, v7.16b, v29.16b \n\t" \ + "cmtst v0.8h, v16.8h, v31.8h \n\t" \ + "cmtst v1.8h, v17.8h, v31.8h \n\t" \ + "cmtst v2.8h, v18.8h, v31.8h \n\t" \ + "cmtst v3.8h, v19.8h, v31.8h \n\t" \ + "cmtst v4.8h, v20.8h, v31.8h \n\t" \ + "cmtst v5.8h, v21.8h, v31.8h \n\t" \ + "cmtst v6.8h, v22.8h, v31.8h \n\t" \ + "cmtst v7.8h, v23.8h, v31.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "usra v20.8h, v20.8h, #8 \n\t" \ + "usra v21.8h, v21.8h, #8 \n\t" \ + "usra v22.8h, v22.8h, #8 \n\t" \ + "usra v23.8h, v23.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "sub v4.16b, v20.16b, v4.16b \n\t" \ + "sub v5.16b, v21.16b, v5.16b \n\t" \ + "sub v6.16b, v22.16b, v6.16b \n\t" \ + "sub v7.16b, v23.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v16.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v17.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v18.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v19.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v16.d[0] \n\t" \ + "mov v1.d[1], v17.d[0] \n\t" \ + "mov v2.d[1], v18.d[0] \n\t" \ + "mov v3.d[1], v19.d[0] \n\t" \ + +DEFINE_FAST_PATH(AlphaBlendConst, 32_32, NO, YES) + +/******************************************************************************/ + +static void fastPathSourceWord_1_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 1; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + destPitch * destY + destX; + + uint32_t *clut = *cmLookupTable; + __asm__ volatile ( + "ldr d6, =0x10000000001 \n\t" + "ld1r {v16.4s}, [%[clut]], #4 \n\t" + "ld1r {v17.4s}, [%[clut]] \n\t" + "ldr q18, =0x10000000200000004000000080000000 \n\t" + "ldr q19, =0x01000000020000000400000008000000 \n\t" + : /* Outputs */ + [clut]"+r"(clut) + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. 
*/ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + uint32_t unskew = (-skew) & 31; + + /* Handle leading pixels, if any */ + uint32_t data, carry; + if (leading_src_bits == 0) { + if (skew == 0) + carry = 0; + else + carry = *s++ << skew; + __asm__ volatile ( + "mov v4.s[1], %w[carry] \n\t" + : /* Outputs */ + : /* Inputs */ + [carry]"r"(carry) + ); + } else { + if (skew == 0) { + data = *s++; + carry = 0; + } + else if (leading_src_bits > skew) { + data = *s++ << skew; + carry = *s++; + data |= carry >> unskew; + carry <<= skew; + } else { + carry = *s++; + data = carry >> unskew; + carry <<= skew; + } + data <<= 32 - leading_src_bits; + + __asm__ volatile ( + "mov v4.s[0], %w[data] \n\t" + "mov v4.s[1], %w[carry] \n\t" + + /* Process 1..7 pixels */ + "dup v3.4s, v4.s[0] \n\t" + "cmtst v0.4s, v3.4s, v18.4s \n\t" + "cmtst v1.4s, v3.4s, v19.4s \n\t" + "bsl v0.16b, v17.16b, v16.16b \n\t" + "bsl v1.16b, v17.16b, v16.16b \n\t" + "tst %[bits], #4 \n\t" + "b.eq 1f \n\t" + "str q0, [%[dest]], #16 \n\t" + "mov v0.16b, v1.16b \n\t" + "1: \n\t" + "tst %[bits], #2 \n\t" + "b.eq 1f \n\t" + "str d0, [%[dest]], #8 \n\t" + "mov v0.d[0], v0.d[1] \n\t" + "1: \n\t" + "tst %[bits], #1 \n\t" + "b.eq 1f \n\t" + "str s0, [%[dest]], #4 \n\t" + "1: \n\t" + : /* Outputs */ + [dest]"+r"(dest) + : /* Inputs */ + [data]"r"(data), + [carry]"r"(carry), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + } + + uint32_t src_bits = srcBPP*256/destBPP; + uint32_t unskew2 = -(unskew & (src_bits - 1)); + uint32_t skew2 = src_bits + unskew2; + __asm__ volatile ( + "mov v5.s[0], %w[unskew2] \n\t" + "mov v5.s[1], %w[skew2] \n\t" + : /* Outputs */ + : /* Inputs */ + [unskew2]"r"(unskew2), + [skew2]"r"(skew2) + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load source word (if necessary) and shuffle */ + "cmp %[unskew], %[src_bits] \n\t" + "b.hs 1f \n\t" + "ushr d0, d4, #32 \n\t" + "ld1r {v4.2s}, [%[s]], #4 \n\t" + "ushl v4.2s, v4.2s, v5.2s \n\t" + "orr v4.8b, v4.8b, v0.8b \n\t" + "b 2f \n\t" + "1: \n\t" + "mul v4.2s, v6.2s, v4.s[1] \n\t" + "2: \n\t" + + /* Process 8 pixels */ + "dup v3.4s, v4.s[0] \n\t" + "cmtst v0.4s, v3.4s, v18.4s \n\t" + "cmtst v1.4s, v3.4s, v19.4s \n\t" + "bsl v0.16b, v17.16b, v16.16b \n\t" + "bsl v1.16b, v17.16b, v16.16b \n\t" + "st1 {v0.16b-v1.16b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [unskew]"r"(unskew), + [src_bits]"I"(src_bits) + : /* Clobbers */ + "memory", "cc" + ); + unskew = (unskew - src_bits) & 31; + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + if (unskew == 0) + data = *s++; + else + __asm__ volatile ( + "mov %w[data], v4.s[1] \n\t" + : /* Outputs */ + [data]"=r"(data) + ); + *dest++ = (*cmLookupTable)[data >> 31]; + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +static void fastPathSourceWord_2_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 2; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + destPitch * destY + destX; + + uint32_t *clut = *cmLookupTable; + __asm__ volatile ( + "ldr d6, =0x1000000000001 \n\t" + "ld4 {v16.b-v19.b}[0], [%[clut]], #4 \n\t" + "ld4 {v16.b-v19.b}[1], [%[clut]], #4 \n\t" + "ld4 {v16.b-v19.b}[2], [%[clut]], #4 \n\t" + "ld4 
{v16.b-v19.b}[3], [%[clut]] \n\t" + "ldr d20, =0x00FEFCFA00FEFCFA \n\t" + "ldr d21, =0x0303030303030303 \n\t" + : /* Outputs */ + [clut]"+r"(clut) + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. */ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + uint32_t unskew = (-skew) & 31; + + /* Handle leading pixels, if any */ + uint32_t data, carry; + if (leading_src_bits == 0) { + if (skew == 0) + carry = 0; + else + carry = *s++ << skew; + __asm__ volatile ( + "mov v4.s[1], %w[carry] \n\t" + : /* Outputs */ + : /* Inputs */ + [carry]"r"(carry) + ); + } else { + if (skew == 0) { + data = *s++; + carry = 0; + } + else if (leading_src_bits > skew) { + data = *s++ << skew; + carry = *s++; + data |= carry >> unskew; + carry <<= skew; + } else { + carry = *s++; + data = carry >> unskew; + carry <<= skew; + } + data <<= 32 - leading_src_bits; + + __asm__ volatile ( + "mov v4.s[0], %w[data] \n\t" + "mov v4.s[1], %w[carry] \n\t" + + /* Process 1..7 pixels */ + "dup v2.8b, v4.b[3] \n\t" + "dup v3.8b, v4.b[2] \n\t" + "ext v3.8b, v2.8b, v3.8b, #4 \n\t" + "ushl v3.8b, v3.8b, v20.8b \n\t" + "and v3.8b, v3.8b, v21.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "zip1 v24.8b, v0.8b, v2.8b \n\t" + "zip2 v26.8b, v0.8b, v2.8b \n\t" + "zip1 v25.8b, v1.8b, v3.8b \n\t" + "zip2 v27.8b, v1.8b, v3.8b \n\t" + "zip1 v0.8b, v24.8b, v25.8b \n\t" + "zip2 v1.8b, v24.8b, v25.8b \n\t" + "zip1 v2.8b, v26.8b, v27.8b \n\t" + "zip2 v3.8b, v26.8b, v27.8b \n\t" + "tst %[bits], #8 \n\t" + "b.eq 1f \n\t" + "st1 {v0.8b-v1.8b}, [%[dest]], #16 \n\t" + "mov v0.8b, v2.8b \n\t" + "mov v1.8b, v3.8b \n\t" + "1: \n\t" + "tst %[bits], #4 \n\t" + "b.eq 1f \n\t" + "str d0, [%[dest]], #8 \n\t" + "mov v0.8b, v1.8b \n\t" + "1: \n\t" + "tst %[bits], #2 \n\t" + "b.eq 1f \n\t" + "str s0, [%[dest]], #4 \n\t" + "1: \n\t" + : /* Outputs */ + [dest]"+r"(dest) + : /* Inputs */ + [data]"r"(data), + [carry]"r"(carry), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + } + + uint32_t src_bits = srcBPP*256/destBPP; + uint32_t unskew2 = -(unskew & (src_bits - 1)); + uint32_t skew2 = src_bits + unskew2; + __asm__ volatile ( + "mov v5.s[0], %w[unskew2] \n\t" + "mov v5.s[1], %w[skew2] \n\t" + : /* Outputs */ + : /* Inputs */ + [unskew2]"r"(unskew2), + [skew2]"r"(skew2) + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load source word (if necessary) and shuffle */ + "cmp %[unskew], %[src_bits] \n\t" + "b.hs 1f \n\t" + "ushr d0, d4, #32 \n\t" + "ld1r {v4.2s}, [%[s]], #4 \n\t" + "ushl v4.2s, v4.2s, v5.2s \n\t" + "orr v4.8b, v4.8b, v0.8b \n\t" + "b 2f \n\t" + "1: \n\t" + "mul v4.2s, v6.2s, v4.s[1] \n\t" + "2: \n\t" + + /* Process 8 pixels */ + 
"dup v2.8b, v4.b[3] \n\t" + "dup v3.8b, v4.b[2] \n\t" + "ext v3.8b, v2.8b, v3.8b, #4 \n\t" + "ushl v3.8b, v3.8b, v20.8b \n\t" + "and v3.8b, v3.8b, v21.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "st4 {v0.8b-v3.8b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [unskew]"r"(unskew), + [src_bits]"I"(src_bits) + : /* Clobbers */ + "memory", "cc" + ); + unskew = (unskew - src_bits) & 31; + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + if (unskew == 0) + data = *s++; + else + __asm__ volatile ( + "mov %w[data], v4.s[1] \n\t" + : /* Outputs */ + [data]"=r"(data) + ); + *dest++ = (*cmLookupTable)[data >> 30]; + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +static void fastPathSourceWord_4_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 4; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + destPitch * destY + destX; + + uint32_t *clut = *cmLookupTable; + __asm__ volatile ( + "ld4 {v16.16b-v19.16b}, [%[clut]] \n\t" + "ldr s20, =0x10101010 \n\t" + "ldr d21, =0x00FC00FC00FC00FC \n\t" + : /* Outputs */ + [clut]"+r"(clut) + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. 
*/ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + uint32_t unskew = (-skew) & 31; + + /* Handle leading pixels, if any */ + uint32_t data, carry; + if (leading_src_bits == 0) { + if (skew == 0) + carry = 0; + else + carry = *s++ << skew; + __asm__ volatile ( + "mov v4.s[1], %w[carry] \n\t" + : /* Outputs */ + : /* Inputs */ + [carry]"r"(carry) + ); + } else { + if (skew == 0) { + data = *s++; + carry = 0; + } + else if (leading_src_bits > skew) { + data = *s++ << skew; + carry = *s++; + data |= carry >> unskew; + carry <<= skew; + } else { + carry = *s++; + data = carry >> unskew; + carry <<= skew; + } + data <<= 32 - leading_src_bits; + + __asm__ volatile ( + "mov v4.s[0], %w[data] \n\t" + "mov v4.s[1], %w[carry] \n\t" + + /* Process 1..7 pixels */ + "umull v3.8h, v4.8b, v20.8b \n\t" + "ushl v3.8b, v3.8b, v21.8b \n\t" + "rev64 v3.8b, v3.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "zip1 v24.8b, v0.8b, v2.8b \n\t" + "zip2 v26.8b, v0.8b, v2.8b \n\t" + "zip1 v25.8b, v1.8b, v3.8b \n\t" + "zip2 v27.8b, v1.8b, v3.8b \n\t" + "zip1 v0.8b, v24.8b, v25.8b \n\t" + "zip2 v1.8b, v24.8b, v25.8b \n\t" + "zip1 v2.8b, v26.8b, v27.8b \n\t" + "zip2 v3.8b, v26.8b, v27.8b \n\t" + "tst %[bits], #16 \n\t" + "b.eq 1f \n\t" + "st1 {v0.8b-v1.8b}, [%[dest]], #16 \n\t" + "mov v0.8b, v2.8b \n\t" + "mov v1.8b, v3.8b \n\t" + "1: \n\t" + "tst %[bits], #8 \n\t" + "b.eq 1f \n\t" + "str d0, [%[dest]], #8 \n\t" + "mov v0.8b, v1.8b \n\t" + "1: \n\t" + "tst %[bits], #4 \n\t" + "b.eq 1f \n\t" + "str s0, [%[dest]], #4 \n\t" + "1: \n\t" + : /* Outputs */ + [dest]"+r"(dest) + : /* Inputs */ + [data]"r"(data), + [carry]"r"(carry), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + } + + uint32_t src_bits = srcBPP*256/destBPP; + uint32_t unskew2 = -(unskew & (src_bits - 1)); + uint32_t skew2 = src_bits + unskew2; + __asm__ volatile ( + "mov v5.s[0], %w[unskew2] \n\t" + "mov v5.s[1], %w[skew2] \n\t" + : /* Outputs */ + : /* Inputs */ + [unskew2]"r"(unskew2), + [skew2]"r"(skew2) + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load source word and shuffle */ + "ushr d0, d4, #32 \n\t" + "ld1r {v4.2s}, [%[s]], #4 \n\t" + "ushl v4.2s, v4.2s, v5.2s \n\t" + "orr v4.8b, v4.8b, v0.8b \n\t" + + /* Process 8 pixels */ + "umull v3.8h, v4.8b, v20.8b \n\t" + "ushl v3.8b, v3.8b, v21.8b \n\t" + "rev64 v3.8b, v3.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "st4 {v0.8b-v3.8b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [unskew]"r"(unskew) + : /* Clobbers */ + "memory", "cc" + ); + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + if (unskew == 0) + data = *s++; + else + __asm__ volatile ( + "mov %w[data], v4.s[1] \n\t" + : /* Outputs */ + [data]"=r"(data) + ); + *dest++ = (*cmLookupTable)[data >> 28]; + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +static void fastPathSourceWord_16_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 16; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + 
destPitch * destY + destX; + + __asm__ volatile ( + "ldr q16, =0x00F800F800F800F800F800F800F800F8 \n\t" + "ldr q17, =0xF800F800F800F800F800F800F800F800 \n\t" + "ldr q18, =0x80008000800080008000800080008000 \n\t" + "ldr q19, =0x00010001000100010001000100010001 \n\t" + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. */ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + + if (skew == 0) { + __asm__ volatile ( + /* Load leading pixels */ + "cbz %w[bits], 10f \n\t" + "tbz %w[bits], #4, 1f \n\t" + "ld1 {v4.s}[0], [%[s]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #5, 1f \n\t" + "ld1 {v4.s}[1], [%[s]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 1f \n\t" + "ld1 {v4.d}[1], [%[s]], #8 \n\t" + "1: \n\t" + "rev32 v4.8h, v4.8h \n\t" + + /* Process 1..7 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store leading pixels */ + "tbz %w[bits], #4, 1f \n\t" + "st1 {v0.s}[1], [%[dest]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #5, 1f \n\t" + "st1 {v0.d}[1], [%[dest]], #8 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 10f \n\t" + "st1 {v1.16b}, [%[dest]], #16 \n\t" + "10: \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load pixels */ + "ld1 {v4.16b}, [%[s]], #16 \n\t" + "rev32 v4.8h, v4.8h \n\t" + + /* Process 8 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store pixels */ + "st1 {v0.16b-v1.16b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + : /* Clobbers */ + "memory", "cc" + ); + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + uint32_t in = *s; + uint32_t r = (in >> 7) & 0xF80000; + uint32_t g = (in >> 10) & 0xF800; + uint32_t b = (in >> 13) & 0xF8; + *dest++ = r | g | b | ((in >> 16) == 0x8000); + } + + } else { + __asm__ volatile ( + /* Load leading pixels */ + "ld1r {v4.4s}, [%[s]], #4 \n\t" + "tbz %w[bits], #5, 1f \n\t" + "ld1r {v5.4s}, [%[s]], #4 \n\t" + "ext v4.16b, v4.16b, v5.16b, #12 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 1f \n\t" + "ld1 
{v4.d}[1], [%[s]], #8 \n\t" + "1: \n\t" + "rev32 v5.8h, v4.8h \n\t" + "ext v4.16b, v5.16b, v5.16b, #14 \n\t" + "cbz %w[bits], 10f \n\t" + + /* Process 1..7 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store leading pixels */ + "tbz %w[bits], #4, 1f \n\t" + "st1 {v0.s}[1], [%[dest]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #5, 1f \n\t" + "st1 {v0.d}[1], [%[dest]], #8 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 10f \n\t" + "st1 {v1.16b}, [%[dest]], #16 \n\t" + "10: \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [skew]"r"(skew), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load pixels */ + "mov v4.16b, v5.16b \n\t" + "ld1 {v5.16b}, [%[s]], #16 \n\t" + "rev32 v5.8h, v5.8h \n\t" + "ext v4.16b, v4.16b, v5.16b, #14 \n\t" + + /* Process 8 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store pixels */ + "st1 {v0.16b-v1.16b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + : /* Clobbers */ + "memory", "cc" + ); + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + uint32_t in; + __asm__ volatile ( + "mov %w[in], v5.s[3] \n\t" + : /* Outputs */ + [in]"=r"(in) + ); + uint32_t r = (in >> 7) & 0xF80000; + uint32_t g = (in >> 10) & 0xF800; + uint32_t b = (in >> 13) & 0xF8; + *dest++ = r | g | b | ((in >> 16) == 0x8000); + } + + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +#define AlphaBlend_scalar_0_32_DECLARE_TEMPORARIES + +#define AlphaBlend_scalar_0_32_PASS_TEMPORARIES + +#define AlphaBlend_scalar_0_32_INIT \ + "ld4r {v20.16b-v23.16b}, [%[halftone]] \n\t" \ + "movi v18.16b, #0xFF \n\t" \ + "mvn v19.16b, v23.16b \n\t" \ + "umull v20.8h, v20.8b, v23.8b \n\t" \ + "umull v21.8h, v21.8b, v23.8b \n\t" \ + "umull v22.8h, v22.8b, v23.8b \n\t" \ + "umull v23.8h, v18.8b, v23.8b \n\t" \ + +#define AlphaBlend_scalar_0_32_32 \ + "ld4 {v4.b-v7.b}[0], [%[dest]] \n\t" \ + AlphaBlend_scalar_0_32_256 \ + "st4 {v0.b-v3.b}[0], [%[dest]], #4 \n\t" \ + +#define AlphaBlend_scalar_0_32_256 \ + "umull v24.8h, v4.8b, v19.8b \n\t" \ + "umull v25.8h, v5.8b, v19.8b \n\t" \ + "umull v26.8h, v6.8b, v19.8b \n\t" \ + "umull v27.8h, v7.8b, v19.8b \n\t" \ + "add v24.8h, v24.8h, v20.8h \n\t" \ + "add v25.8h, v25.8h, v21.8h \n\t" \ + "add v26.8h, v26.8h, v22.8h \n\t" \ + "add v27.8h, v27.8h, v23.8h \n\t" \ + "cmtst v0.8h, v24.8h, v18.8h \n\t" \ + "cmtst v1.8h, v25.8h, v18.8h \n\t" \ + "cmtst v2.8h, v26.8h, v18.8h \n\t" \ + "cmtst v3.8h, v27.8h, v18.8h \n\t" \ + "usra v24.8h, v24.8h, #8 \n\t" \ + "usra v25.8h, v25.8h, #8 \n\t" \ + "usra v26.8h, v26.8h, #8 \n\t" \ + "usra v27.8h, v27.8h, #8 \n\t" \ + "sub v0.16b, v24.16b, 
v0.16b \n\t" \ + "sub v1.16b, v25.16b, v1.16b \n\t" \ + "sub v2.16b, v26.16b, v2.16b \n\t" \ + "sub v3.16b, v27.16b, v3.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + +#define AlphaBlend_scalar_0_32_512 \ + "umull v24.8h, v4.8b, v19.8b \n\t" \ + "umull2 v25.8h, v4.16b, v19.16b \n\t" \ + "umull v26.8h, v5.8b, v19.8b \n\t" \ + "umull2 v27.8h, v5.16b, v19.16b \n\t" \ + "umull v28.8h, v6.8b, v19.8b \n\t" \ + "umull2 v29.8h, v6.16b, v19.16b \n\t" \ + "umull v30.8h, v7.8b, v19.8b \n\t" \ + "umull2 v31.8h, v7.16b, v19.16b \n\t" \ + "add v24.8h, v24.8h, v20.8h \n\t" \ + "add v25.8h, v25.8h, v20.8h \n\t" \ + "add v26.8h, v26.8h, v21.8h \n\t" \ + "add v27.8h, v27.8h, v21.8h \n\t" \ + "add v28.8h, v28.8h, v22.8h \n\t" \ + "add v29.8h, v29.8h, v22.8h \n\t" \ + "add v30.8h, v30.8h, v23.8h \n\t" \ + "add v31.8h, v31.8h, v23.8h \n\t" \ + "cmtst v0.8h, v24.8h, v18.8h \n\t" \ + "cmtst v1.8h, v25.8h, v18.8h \n\t" \ + "cmtst v2.8h, v26.8h, v18.8h \n\t" \ + "cmtst v3.8h, v27.8h, v18.8h \n\t" \ + "cmtst v4.8h, v28.8h, v18.8h \n\t" \ + "cmtst v5.8h, v29.8h, v18.8h \n\t" \ + "cmtst v6.8h, v30.8h, v18.8h \n\t" \ + "cmtst v7.8h, v31.8h, v18.8h \n\t" \ + "usra v24.8h, v24.8h, #8 \n\t" \ + "usra v25.8h, v25.8h, #8 \n\t" \ + "usra v26.8h, v26.8h, #8 \n\t" \ + "usra v27.8h, v27.8h, #8 \n\t" \ + "usra v28.8h, v28.8h, #8 \n\t" \ + "usra v29.8h, v29.8h, #8 \n\t" \ + "usra v30.8h, v30.8h, #8 \n\t" \ + "usra v31.8h, v31.8h, #8 \n\t" \ + "sub v0.16b, v24.16b, v0.16b \n\t" \ + "sub v1.16b, v25.16b, v1.16b \n\t" \ + "sub v2.16b, v26.16b, v2.16b \n\t" \ + "sub v3.16b, v27.16b, v3.16b \n\t" \ + "sub v4.16b, v28.16b, v4.16b \n\t" \ + "sub v5.16b, v29.16b, v5.16b \n\t" \ + "sub v6.16b, v30.16b, v6.16b \n\t" \ + "sub v7.16b, v31.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v24.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v25.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v26.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v27.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v24.d[0] \n\t" \ + "mov v1.d[1], v25.d[0] \n\t" \ + "mov v2.d[1], v26.d[0] \n\t" \ + "mov v3.d[1], v27.d[0] \n\t" \ + +DEFINE_FAST_PATH(AlphaBlend_scalar, 0_32, YES, YES) + +/******************************************************************************/ + +#define AlphaBlend_map1_scalar_32_32_DECLARE_STACK uint64_t save[3] + +#define AlphaBlend_map1_scalar_32_32_PASS_STACK , [save]"r"(save) + +#define AlphaBlend_map1_scalar_32_32_DECLARE_TEMPORARIES + +#define AlphaBlend_map1_scalar_32_32_PASS_TEMPORARIES + +#define AlphaBlend_map1_scalar_32_32_INIT \ + "str d13, [%[save], #0] \n\t" \ + "stp d14, d15, [%[save], #8] \n\t" \ + "ld4r {v24.16b-v27.16b}, [%[clut]], #4 \n\t" \ + "ld4r {v28.16b-v31.16b}, [%[clut]] \n\t" \ + "ld4r {v16.16b-v19.16b}, [%[halftone]] \n\t" \ + "movi v13.16b, #0xFF \n\t" \ + "and v24.8b, v24.8b, v16.8b \n\t" \ + "and v25.8b, v25.8b, v17.8b \n\t" \ + "and v26.8b, v26.8b, v18.8b \n\t" \ + "and v27.16b, v27.16b, v19.16b \n\t" \ + "and v28.8b, v28.8b, v16.8b \n\t" \ + "and v29.8b, v29.8b, v17.8b \n\t" \ + "and v30.8b, v30.8b, v18.8b \n\t" \ + "and v31.16b, v31.16b, v19.16b \n\t" \ + "mvn v14.16b, v27.16b \n\t" \ + "mvn v15.16b, v31.16b \n\t" \ + "umull v24.8h, v24.8b, v27.8b \n\t" \ + "umull v25.8h, v25.8b, v27.8b \n\t" \ + "umull v26.8h, v26.8b, v27.8b \n\t" \ + "umull v27.8h, v13.8b, v27.8b \n\t" \ + "umull v28.8h, v28.8b, v31.8b \n\t" \ + "umull v29.8h, v29.8b, v31.8b \n\t" \ + "umull 
v30.8h, v30.8b, v31.8b \n\t" \ + "umull v31.8h, v13.8b, v31.8b \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_FINAL \ + "ldr d13, [%[save], #0] \n\t" \ + "ldp d14, d15, [%[save], #8] \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_32 \ + "ldr s1, [%[src]], #4 \n\t" \ + "ld4 {v4.b-v7.b}[0], [%[dest]] \n\t" \ + "cmtst v0.2s, v1.2s, v13.2s \n\t" \ + "cmtst v16.2s, v1.2s, v13.2s \n\t" \ + "cmtst v17.2s, v1.2s, v13.2s \n\t" \ + "cmtst v18.2s, v1.2s, v13.2s \n\t" \ + "cmtst v19.2s, v1.2s, v13.2s \n\t" \ + "bsl v0.8b, v15.8b, v14.8b \n\t" \ + "bsl v16.8b, v28.8b, v24.8b \n\t" \ + "bsl v17.8b, v29.8b, v25.8b \n\t" \ + "bsl v18.8b, v30.8b, v26.8b \n\t" \ + "bsl v19.8b, v31.8b, v27.8b \n\t" \ + "umlal v16.8h, v4.8b, v0.8b \n\t" \ + "umlal v17.8h, v5.8b, v0.8b \n\t" \ + "umlal v18.8h, v6.8b, v0.8b \n\t" \ + "umlal v19.8h, v7.8b, v0.8b \n\t" \ + "cmtst v0.4h, v16.4h, v13.4h \n\t" \ + "cmtst v1.4h, v17.4h, v13.4h \n\t" \ + "cmtst v2.4h, v18.4h, v13.4h \n\t" \ + "cmtst v3.4h, v19.4h, v13.4h \n\t" \ + "usra v16.4h, v16.4h, #8 \n\t" \ + "usra v17.4h, v17.4h, #8 \n\t" \ + "usra v18.4h, v18.4h, #8 \n\t" \ + "usra v19.4h, v19.4h, #8 \n\t" \ + "sub v0.8b, v16.8b, v0.8b \n\t" \ + "sub v1.8b, v17.8b, v1.8b \n\t" \ + "sub v2.8b, v18.8b, v2.8b \n\t" \ + "sub v3.8b, v19.8b, v3.8b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + "st4 {v0.b-v3.b}[0], [%[dest]], #4 \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_256 \ + "orr v0.8b, v0.8b, v1.8b \n\t" \ + "orr v2.8b, v2.8b, v3.8b \n\t" \ + "orr v0.8b, v0.8b, v2.8b \n\t" \ + "cmtst v0.8b, v0.8b, v13.8b \n\t" \ + "sxtl v16.8h, v0.8b \n\t" \ + "sxtl v17.8h, v0.8b \n\t" \ + "sxtl v18.8h, v0.8b \n\t" \ + "sxtl v19.8h, v0.8b \n\t" \ + "bsl v0.8b, v15.8b, v14.8b \n\t" \ + "bsl v16.16b, v28.16b, v24.16b \n\t" \ + "bsl v17.16b, v29.16b, v25.16b \n\t" \ + "bsl v18.16b, v30.16b, v26.16b \n\t" \ + "bsl v19.16b, v31.16b, v27.16b \n\t" \ + "umlal v16.8h, v4.8b, v0.8b \n\t" \ + "umlal v17.8h, v5.8b, v0.8b \n\t" \ + "umlal v18.8h, v6.8b, v0.8b \n\t" \ + "umlal v19.8h, v7.8b, v0.8b \n\t" \ + "cmtst v0.8h, v16.8h, v13.8h \n\t" \ + "cmtst v1.8h, v17.8h, v13.8h \n\t" \ + "cmtst v2.8h, v18.8h, v13.8h \n\t" \ + "cmtst v3.8h, v19.8h, v13.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_512 \ + "orr v0.16b, v0.16b, v1.16b \n\t" \ + "orr v2.16b, v2.16b, v3.16b \n\t" \ + "orr v0.16b, v0.16b, v2.16b \n\t" \ + "cmtst v0.16b, v0.16b, v13.16b \n\t" \ + "sxtl v16.8h, v0.8b \n\t" \ + "sxtl2 v17.8h, v0.16b \n\t" \ + "sxtl v18.8h, v0.8b \n\t" \ + "sxtl2 v19.8h, v0.16b \n\t" \ + "sxtl v20.8h, v0.8b \n\t" \ + "sxtl2 v21.8h, v0.16b \n\t" \ + "sxtl v22.8h, v0.8b \n\t" \ + "sxtl2 v23.8h, v0.16b \n\t" \ + "bsl v0.16b, v15.16b, v14.16b \n\t" \ + "bsl v16.16b, v28.16b, v24.16b \n\t" \ + "bsl v17.16b, v28.16b, v24.16b \n\t" \ + "bsl v18.16b, v29.16b, v25.16b \n\t" \ + "bsl v19.16b, v29.16b, v25.16b \n\t" \ + "bsl v20.16b, v30.16b, v26.16b \n\t" \ + "bsl v21.16b, v30.16b, v26.16b \n\t" \ + "bsl v22.16b, v31.16b, v27.16b \n\t" \ + "bsl v23.16b, v31.16b, v27.16b \n\t" \ + "umlal v16.8h, v4.8b, v0.8b 
\n\t" \ + "umlal2 v17.8h, v4.16b, v0.16b \n\t" \ + "umlal v18.8h, v5.8b, v0.8b \n\t" \ + "umlal2 v19.8h, v5.16b, v0.16b \n\t" \ + "umlal v20.8h, v6.8b, v0.8b \n\t" \ + "umlal2 v21.8h, v6.16b, v0.16b \n\t" \ + "umlal v22.8h, v7.8b, v0.8b \n\t" \ + "umlal2 v23.8h, v7.16b, v0.16b \n\t" \ + "cmtst v0.8h, v16.8h, v13.8h \n\t" \ + "cmtst v1.8h, v17.8h, v13.8h \n\t" \ + "cmtst v2.8h, v18.8h, v13.8h \n\t" \ + "cmtst v3.8h, v19.8h, v13.8h \n\t" \ + "cmtst v4.8h, v20.8h, v13.8h \n\t" \ + "cmtst v5.8h, v21.8h, v13.8h \n\t" \ + "cmtst v6.8h, v22.8h, v13.8h \n\t" \ + "cmtst v7.8h, v23.8h, v13.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "usra v20.8h, v20.8h, #8 \n\t" \ + "usra v21.8h, v21.8h, #8 \n\t" \ + "usra v22.8h, v22.8h, #8 \n\t" \ + "usra v23.8h, v23.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "sub v4.16b, v20.16b, v4.16b \n\t" \ + "sub v5.16b, v21.16b, v5.16b \n\t" \ + "sub v6.16b, v22.16b, v6.16b \n\t" \ + "sub v7.16b, v23.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v16.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v17.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v18.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v19.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v16.d[0] \n\t" \ + "mov v1.d[1], v17.d[0] \n\t" \ + "mov v2.d[1], v18.d[0] \n\t" \ + "mov v3.d[1], v19.d[0] \n\t" \ + +DEFINE_FAST_PATH(AlphaBlend_map1_scalar, 32_32, YES, YES) + +/******************************************************************************/ + +static fast_path_t fastPaths[] = { + { fastPathAlphaBlend_32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, + { fastPathPixPaint_32_32, CR_pixPaint, STD_FLAGS(32,32,NO,NO) }, + + { fastPathSourceWord_1_32, CR_sourceWord, STD_FLAGS(1,32,DIRECT,NO) }, + { fastPathSourceWord_2_32, CR_sourceWord, STD_FLAGS(2,32,DIRECT,NO) }, + { fastPathSourceWord_4_32, CR_sourceWord, STD_FLAGS(4,32,DIRECT,NO) }, + { fastPathSourceWord_16_32, CR_sourceWord, STD_FLAGS(16,32,NO,NO) }, + + { fastPathAlphaBlend_scalar_0_32, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + { fastPathAlphaBlend_map1_scalar_32_32, CR_alphaBlend, STD_FLAGS(32,32,1BIT,SCALAR) }, + + { fastPathBitAnd_32_32, CR_bitAnd, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitAndInvert_32_32, CR_bitAndInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertAnd_32_32, CR_bitInvertAnd, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitXor_32_32, CR_bitXor, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitOr_32_32, CR_bitOr, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertAndInvert_32_32, CR_bitInvertAndInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertXor_32_32, CR_bitInvertXor, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertDestination_0_32, CR_bitInvertDestination, STD_FLAGS_NO_SOURCE(32,NO) }, + { fastPathBitOrInvert_32_32, CR_bitOrInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertSource_32_32, CR_bitInvertSource, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertOr_32_32, CR_bitInvertOr, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertOrInvert_32_32, CR_bitInvertOrInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathAddWord_32_32, CR_addWord, STD_FLAGS(32,32,NO,NO) }, + { fastPathSubWord_32_32, CR_subWord, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbAdd_32_32, CR_rgbAdd, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbSub_32_32, CR_rgbSub, STD_FLAGS(32,32,NO,NO) }, + { 
fastPathPixMask_32_32, CR_pixMask, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbMax_32_32, CR_rgbMax, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbMin_32_32, CR_rgbMin, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbMinInvert_32_32, CR_rgbMinInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathAlphaBlendConst_32_32, CR_alphaBlendConst, STD_FLAGS(32,32,NO,NO) }, +}; + +void addArm64FastPaths(void) +{ + addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths); +} diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h new file mode 100644 index 0000000000..afc73251bc --- /dev/null +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h @@ -0,0 +1,30 @@ +/* + * Copyright © 2021 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + */ + +#ifndef BITBLTARM64_H_ +#define BITBLTARM64_H_ + +void addArm64FastPaths(void); + +#endif /* BITBLTARM64_H_ */ diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c index 10a762be43..a2e0aa32ab 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c @@ -32,6 +32,7 @@ #include "BitBltDispatch.h" #include "BitBltArm.h" +#include "BitBltArm64.h" #include "BitBltGeneric.h" #include "BitBltInternal.h" @@ -202,7 +203,9 @@ void initialiseCopyBits(void) #ifdef __arm__ addArmFastPaths(); #endif - +#ifdef __aarch64__ + addArm64FastPaths(); +#endif } void addFastPaths(fast_path_t *paths, size_t n) @@ -315,6 +318,55 @@ void copyBitsDispatch(operation_t *op) * each pixel has value 0 or not (note that non-zero pixels that * reduce to 0 when bit-packed use lookup index 1) */ +#ifdef __aarch64__ + if (op->cmMask >= 15 && (op->cmMask & (op->cmMask + 1)) == 0) { + uint32_t value = (*op->cmLookupTable)[1]; + uint32_t *ptr = (*op->cmLookupTable) + 2; + uint32_t *end = (*op->cmLookupTable) + 16; + for (; ptr != end; ++ptr) + if (*ptr != value) + break; + if (ptr == end) { + uint64_t result0, result1; + end += op->cmMask + 1 - 16; + __asm__ volatile ( + "dup v16.4s, %w[value] \n\t" + "movi v4.16b, #0 \n\t" + "movi v5.16b, #0 \n\t" + "movi v6.16b, #0 \n\t" + "movi v7.16b, #0 \n\t" + "1: \n\t" + "ld1 {v0.16b-v3.16b}, [%[ptr]], #64 \n\t" + "eor v0.16b, v0.16b, v16.16b \n\t" + "eor v1.16b, v1.16b, v16.16b \n\t" + "eor v2.16b, v2.16b, v16.16b \n\t" + "eor v3.16b, v3.16b, v16.16b \n\t" + "orr v4.16b, v4.16b, v0.16b \n\t" + "orr v5.16b, v5.16b, v1.16b \n\t" + "orr v6.16b, v6.16b, v2.16b \n\t" + "orr v7.16b, v7.16b, v3.16b \n\t" + "cmp %[ptr], %[end] \n\t" + "b.ne 1b \n\t" + "orr v4.16b, v4.16b, v5.16b \n\t" + "orr v6.16b, v6.16b, v7.16b \n\t" + "orr v4.16b, v4.16b, v6.16b \n\t" + "mov %[result0], v4.d[0] \n\t" + "mov %[result1], v4.d[1] \n\t" + : /* Outputs */ + [ptr]"+r"(ptr), + [result0]"=r"(result0), + [result1]"=r"(result1) + : /* Inputs */ + [value]"r"(value), + [end]"r"(end) + : /* Clobbers */ + "cc" + ); + if (result0 == 0 && result1 == 0) + flags |= FAST_PATH_1BIT_COLOR_MAP; + } + } +#else usqInt i; flags |= FAST_PATH_1BIT_COLOR_MAP; for (i = op->cmMask; i >= 2; --i) { @@ -323,6 +375,7 @@ void copyBitsDispatch(operation_t *op) break; } } +#endif if ((flags & FAST_PATH_1BIT_COLOR_MAP) == 0) { if (op->src.depth == 32) { if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0) diff --git a/platforms/unix/config/configure b/platforms/unix/config/configure index a2f424d486..4d7d0a5260 100755 --- a/platforms/unix/config/configure +++ b/platforms/unix/config/configure @@ -18401,7 +18401,7 @@ aarch64) # Check whether --enable-fast-bitblt was given. 
if test "${enable_fast_bitblt+set}" = set; then : enableval=$enable_fast_bitblt; if test "x$enableval" = "xyes" ; then - bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_objs="BitBltPlugin.o BitBltArm64.o BitBltDispatch.o BitBltGeneric.o" bitblt_flags="-DENABLE_FAST_BLT" fi diff --git a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 index 146b16acee..f7f2e51f2c 100644 --- a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 +++ b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 @@ -22,7 +22,7 @@ aarch64) AC_ARG_ENABLE(fast-bitblt, [ --enable-fast-bitblt enable fast BitBlt optimizations (default=no)], [ if test "x$enableval" = "xyes" ; then - bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_objs="BitBltPlugin.o BitBltArm64.o BitBltDispatch.o BitBltGeneric.o" bitblt_flags="-DENABLE_FAST_BLT" fi ],
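
The AArch64 branch added to copyBitsDispatch above replaces a scalar scan of the colour map with a NEON loop that consumes the lookup table 64 bytes at a time. For reference, the property that loop establishes can be restated in plain C roughly as follows; this is an illustrative sketch only, not part of the patch, and the function name isOneBitColourMap is invented here. A map qualifies as a 1-bit colour map when every entry from index 2 up to cmMask equals entry 1, so the map only distinguishes pixel value 0 from everything else; the vectorised version additionally requires cmMask >= 15 and cmMask + 1 to be a power of two so the table length is a whole number of 64-byte blocks.

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch (illustrative only) of the condition the NEON loop checks
     * before setting FAST_PATH_1BIT_COLOR_MAP: all lookup-table entries
     * from index 2 through cmMask match entry 1, i.e. the colour map
     * only separates pixel value 0 from non-zero pixel values. */
    static bool isOneBitColourMap(const uint32_t *lookupTable, uint32_t cmMask)
    {
        uint32_t reference = lookupTable[1];
        for (uint32_t i = 2; i <= cmMask; i++)
            if (lookupTable[i] != reference)
                return false;
        return true;
    }

None of the new ARM64 code is compiled unless the VM is configured with --enable-fast-bitblt, which, per the configure and acinclude.m4 hunks above, defines ENABLE_FAST_BLT and adds BitBltArm64.o to the BitBlt plugin objects on aarch64.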