From 9ebc2455224fd283b692654aefc3c9262dfeb3f2 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 14:15:56 +0000 Subject: [PATCH 01/17] Don't pass -m32 to GCC for ARM builds --- build.linux32ARMv6/squeak.cog.spur/build.assert/mvm | 2 +- build.linux32ARMv6/squeak.cog.spur/build.debug/mvm | 2 +- build.linux32ARMv6/squeak.cog.spur/build/mvm | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm b/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm index 0254afa9f4..0e28f57503 100755 --- a/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm +++ b/build.linux32ARMv6/squeak.cog.spur/build.assert/mvm @@ -2,7 +2,7 @@ set -e # assert VM with VM profiler and threaded heartbeat INSTALLDIR=assert/sqcogspurlinuxhtRPi -OPT="-m32 -g3 -O1 -DDEBUGVM=0" +OPT="-g3 -O1 -DDEBUGVM=0" if [ $# -ge 1 ]; then INSTALLDIR="$1"; shift diff --git a/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm b/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm index 764e80f084..c35c5e9f6f 100755 --- a/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm +++ b/build.linux32ARMv6/squeak.cog.spur/build.debug/mvm @@ -2,7 +2,7 @@ set -e # debug Spur VM with VM profiler and threaded heartbeat INSTALLDIR=debug/sqcogspurlinuxhtRPi -OPT="-m32 -g3 -O0 -DDEBUGVM=1" +OPT="-g3 -O0 -DDEBUGVM=1" if [ $# -ge 1 ]; then INSTALLDIR="$1"; shift diff --git a/build.linux32ARMv6/squeak.cog.spur/build/mvm b/build.linux32ARMv6/squeak.cog.spur/build/mvm index e1fd0f57bc..f16d72208d 100755 --- a/build.linux32ARMv6/squeak.cog.spur/build/mvm +++ b/build.linux32ARMv6/squeak.cog.spur/build/mvm @@ -2,7 +2,7 @@ set -e # Spur VM with VM profiler and threaded heartbeat INSTALLDIR=sqcogspurlinuxhtRPi -OPT="-m32 -g -O2 -DNDEBUG -DDEBUGVM=0" +OPT="-g -O2 -DNDEBUG -DDEBUGVM=0" if [ $# -ge 1 ]; then INSTALLDIR="$1"; shift From 38c42831095964b612a39fdb700de5d6802f6777 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 28 Apr 2021 17:09:20 +0100 Subject: [PATCH 02/17] Correct various "#if ENABLE_FAST_BLT" to "#ifdef" ENABLE_FAST_BLT is typically not assigned a value even when it is defined, so "#if" form is tecnically a syntax error. 
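As a minimal stand-alone illustration (hypothetical code, not taken from the plugin;
it assumes the macro is defined with no replacement text, e.g. "#define ENABLE_FAST_BLT"
in a header):

    #define ENABLE_FAST_BLT               /* defined, but expands to nothing */

    #ifdef ENABLE_FAST_BLT                /* correct: tests only definedness */
    int fastBltEnabled = 1;
    #endif

    /* The old form fails to preprocess, because after macro expansion the
     * directive is left as "#if" with an empty controlling expression:
     *
     *     #if ENABLE_FAST_BLT
     *     ...
     *     #endif
     */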
--- src/plugins/BitBltPlugin/BitBltPlugin.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index f9fa00d119..7c7d952f01 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -1368,7 +1368,7 @@ copyBits(void) if (!(lockSurfaces())) { return primitiveFail(); } -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT /* you really, really mustn't call this unless you have the rest of the code to link to */ copyBitsFastPathSpecialised(); @@ -1390,7 +1390,7 @@ static sqInt copyBitsFastPathSpecialised(void) { -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT /* set the affected area to 0 first */ affectedL = (affectedR = (affectedT = (affectedB = 0))); @@ -1814,7 +1814,7 @@ copyBitsFallback(operation_t *op, unsigned int flags) sqInt t; -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT /* recover values from the operation struct used by the fast ARM code */ @@ -3159,7 +3159,7 @@ initialiseModule(void) { initBBOpTable(); initDither8Lookup(); -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT initialiseCopyBits(); # endif return 1; @@ -5192,7 +5192,7 @@ primitiveCompareColors(void) if (failed()) { return null; } -# if ENABLE_FAST_BLT +# ifdef ENABLE_FAST_BLT if (!(loadBitBltFromwarping(rcvr, 0))) { return primitiveFail(); } From 43ce97589b7b38fc80bbe6e8151810e7954b676e Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 15:11:05 +0000 Subject: [PATCH 03/17] Don't assume sourcePPW is valid on entry to copyBitsFallback This is not the case when being called from "fuzz" or "bench" test applications. It may also not be accurate if a fast path has been synthesised from a combination of copyBitsFallback and one or more other fast paths. --- src/plugins/BitBltPlugin/BitBltPlugin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index 7c7d952f01..482e8b5de1 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -1852,6 +1852,7 @@ copyBitsFallback(operation_t *op, unsigned int flags) ungammaLookupTable = (void *) op->opt.componentAlpha.ungammaLookupTable; } destPPW = 32 / destDepth; + sourcePPW = 32 / sourceDepth; cmBitsPerColor = 0; if (cmMask == 0x1FF) { cmBitsPerColor = 3; From df667cab1cbab9829ef426a4a6865e48f6b7d57f Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 17:14:55 +0000 Subject: [PATCH 04/17] Fallback routines need extra help to detect intra-image operations In some places, sourceForm and destForm were being compared to determine which code path to follow. However, when being called from fuzz or other test tools, these structures aren't used to pass parameters, so the pointers haven't been initialised and default to 0, so the wrong code path is followed. Detect such cases and initialise them from sourceBits and destBits instead, since these will perform the same under equality tests. 
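A stand-alone sketch of that equality-test property (hypothetical, simplified types;
not the plugin's actual globals or control flow):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t srcPixels[16], dstPixels[16];
        uintptr_t sourceForm = 0, destForm = 0;     /* left unset by a test harness  */
        assert(sourceForm == destForm);             /* false positive: "same image"  */
        if (sourceForm == 0 && destForm == 0) {     /* substitution made by this fix */
            sourceForm = (uintptr_t) srcPixels;
            destForm   = (uintptr_t) dstPixels;
        }
        assert(sourceForm != destForm);             /* distinct images now detected  */
        return 0;
    }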
--- src/plugins/BitBltPlugin/BitBltPlugin.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index 482e8b5de1..05a757d0c5 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -1863,6 +1863,14 @@ copyBitsFallback(operation_t *op, unsigned int flags) if (cmMask == 0x7FFF) { cmBitsPerColor = 5; } + /* In some places, sourceForm and destForm are compared in order to detect + * whether we're reading and writing the same image. However, these have + * not always been initialised by the time we get here, so substitute + * sourceBits and destBits if so. */ + if (sourceForm == 0 && destForm == 0) { + sourceForm = sourceBits; + destForm = destBits; + } /* begin tryCopyingBitsQuickly */ if (noSource) { done = 0; From 51df83e0b30106b2c0958945f7eb0486b6510709 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 25 Mar 2021 15:42:39 +0000 Subject: [PATCH 05/17] Remove invalid shortcut in rgbComponentAlphawith This shortcut is triggered more frequently than it used to be, due to improvements in copyLoop() that avoid buffer overruns. --- src/plugins/BitBltPlugin/BitBltPlugin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugins/BitBltPlugin/BitBltPlugin.c b/src/plugins/BitBltPlugin/BitBltPlugin.c index 05a757d0c5..428eeecbf4 100644 --- a/src/plugins/BitBltPlugin/BitBltPlugin.c +++ b/src/plugins/BitBltPlugin/BitBltPlugin.c @@ -6392,9 +6392,13 @@ rgbComponentAlphawith(sqInt sourceWord, sqInt destinationWord) sqInt v; alpha = sourceWord; +#if 0 + /* This is not a valid optimisation because alpha == 0 can change the destination + * due to rounding errors in blending and/or in gamma lookup round-trip */ if (alpha == 0) { return destinationWord; } +#endif /* begin partitionedRgbComponentAlpha:dest:nBits:nPartitions: */ /* partition mask starts at the right */ From a64c5b69b9f93a03ac2146799e67fe726943d684 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 8 Apr 2021 10:33:39 +0100 Subject: [PATCH 06/17] Fix bug in 32-bit ARM fast paths When classed as "wide" because each line is long enough to warrant pipelined prefetching as we go along, the inner loop is unrolled enough that there is at least one prefetch instruction per iteration. Loading the source image can only be done in atoms of 32 bit words due to big-endian packing, so when destination pixels are 8 times wider (or more) than source pixels, the loads happen less frequently than the store atoms (quadwords) and a conditional branch per subblock is required to decide whether to do a load or not, depending on the skew and the number of pixels remaining to process. The 'x' register is only updated once per loop, so an assembly-time constant derived from the unrolling subblock number needs to be factored in, but since the number of pixels remaining decreases as the subblock number increases, this should have been a subtraction. In practice, since only the least-significant bits of the result matter, addition and subtraction behave the same when the source:destination pixel ratio is 8, so the only operations affected were 1->16bpp, 2->32bpp and 1->32bpp. The exact threshold that counts as "wide" depends on the prefetch distance that was selected empirically, but typically would require an operation that is several hundreds of pixels wide. 
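The claim about ratios can be checked with a short stand-alone program (illustrative
only, not plugin code): for each depth combination taken by this branch
(dst_w_bpp > 4 * src_bpp), it tests whether adding or subtracting the per-subblock
term can change the bits examined by the TST, whose mask is
32/src_bpp - 128/dst_w_bpp.

    #include <stdio.h>

    int main(void)
    {
        static const int src[] = {1, 2, 4, 1, 2, 1};
        static const int dst[] = {8, 16, 32, 16, 32, 32};
        for (int i = 0; i < 6; i++) {
            int mask = 32 / src[i] - 128 / dst[i];           /* bits tested by TST */
            int differ = 0;
            for (int x = 256; x < 512 && !differ; x++)       /* pixels remaining   */
                for (int s = 0; s < 8 && !differ; s++) {     /* subblock number    */
                    int step = s * 128 / dst[i];             /* per-subblock term  */
                    differ = ((x + step) & mask) != ((x - step) & mask);
                }
            printf("%2d->%2dbpp: ADD %s SUB\n", src[i], dst[i], differ ? "!=" : "==");
        }
        return 0;
    }

The ratio-8 cases (1->8, 2->16, 4->32) report ADD == SUB, while 1->16, 2->32 and
1->32 report ADD != SUB, matching the list of affected operations above.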
--- platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr index efe013cea7..a48a13c4b8 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr @@ -1542,7 +1542,7 @@ subblock SETA 0 WHILE subblock < pix_per_block*dst_w_bpp/128 [ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0 ASSERT flags & FLAG_MAX_256BIT_MACRO = 0 - AddL scratch, x, (prefetch_distance+2)*pix_per_block + subblock*128/dst_w_bpp + AddL scratch, x, (prefetch_distance+2)*pix_per_block - subblock*128/dst_w_bpp TST scratch, #32/src_bpp - 128/dst_w_bpp BNE %FT53 Read1Word src, 0, carry, &$fixed_skew, skew, scratch From b577ab2e4dc476c3ee740fa54af755d864995f42 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 13 Apr 2021 12:24:26 +0100 Subject: [PATCH 07/17] Fix buffer overflow bugs In fastPathDepthConv (which combines sourceWord colour-depth conversion with another fast path for another combinationRule at a constant colour depth) and fastPathRightToLeft, it could overflow the temporary buffer and thereby corrupt other local variables if the last chunk of a pixel row was 2048 bytes (or just under). This was most likely to happen with 32bpp destination images and widths of about 512 pixels. --- .../Cross/plugins/BitBltPlugin/BitBltGeneric.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 762ce7780f..cb474b1f95 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -260,8 +260,13 @@ static void fastPathRightToLeft(operation_t *op, uint32_t flags) uint32_t width = op->width; uint32_t height = op->height; - uint8_t tempBuffer[CACHELINE_LEN * 64]; -#define BUFFER_LEN_PIXELS (cacheline_len * (sizeof tempBuffer / CACHELINE_LEN)) +#define BUFFER_LEN_CACHELINES 64 + /* Need enough room to be able to quickly find BUFFER_LEN_CACHELINES + * starting at any cacheline alignment + */ + uint8_t spaceForTempBuffer[CACHELINE_LEN * (BUFFER_LEN_CACHELINES + 2)]; + uint8_t *tempBuffer = (uint8_t *)(((uintptr_t) spaceForTempBuffer + CACHELINE_LEN - 1) &~ (CACHELINE_LEN - 1)); +#define BUFFER_LEN_PIXELS (cacheline_len * BUFFER_LEN_CACHELINES) opFromSrc.dest.bits = tempBuffer; opFromSrc.dest.y = 0; opFromSrc.height = 1; @@ -377,8 +382,13 @@ static void fastPathDepthConv(operation_t *op, uint32_t flags) uint32_t width = op->width; uint32_t height = op->height; - uint8_t tempBuffer[CACHELINE_LEN * 64]; -#define BUFFER_LEN_PIXELS (cacheline_len * (sizeof tempBuffer / CACHELINE_LEN)) +#define BUFFER_LEN_CACHELINES 64 + /* Need enough room to be able to quickly find BUFFER_LEN_CACHELINES + * starting at any cacheline alignment + */ + uint8_t spaceForTempBuffer[CACHELINE_LEN * (BUFFER_LEN_CACHELINES + 2)]; + uint8_t *tempBuffer = (uint8_t *)(((uintptr_t) spaceForTempBuffer + CACHELINE_LEN - 1) &~ (CACHELINE_LEN - 1)); +#define BUFFER_LEN_PIXELS (cacheline_len * BUFFER_LEN_CACHELINES) opFromSrc.combinationRule = CR_sourceWord; opFromSrc.dest.bits = tempBuffer; opFromSrc.dest.y = 0; From 9084c1757bccb2216dc5e704743132f2df9f854a Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 13 Apr 2021 17:20:54 +0100 Subject: [PATCH 08/17] Fix corruption bugs with wide 1bpp source images For 
images that were wide enough to invoke intra-line preloads, there was a register clash between the preload address calculation and one of the registers holding the deskewed source pixels (this only occurred once per destination cacheline). --- .../BitBltPlugin/BitBltArmSimdSourceWord.s | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s index 27233f5d5f..ec5b1737b9 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s @@ -528,7 +528,7 @@ counter SETA counter + 4 SourceWord GenerateFunctions 1, 4,, \ FLAG_COLOUR_MAP :OR: FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS :OR: FLAG_NO_EXPAND_SKEW, 2, \ - "stride_s,map,bitptrs,orig_w,scratch", \ + "stride_s,map,bitptrs,scratch,orig_w", \ "x,stride_s,bitptrs", orig_w,, init ; leading_pixels_reg=wk3 ; ******************************************************************** @@ -951,7 +951,7 @@ counter SETA counter + 4 MACRO SourceWord1_2_128bits_head $src, $fixed_skew, $intra_preloads - Read2Words src, 3, carry, $fixed_skew, skew, $wk0 + Read2Words src, 2, carry, $fixed_skew, skew, $wk0 MEND MACRO @@ -959,8 +959,8 @@ counter SETA counter + 4 LCLA counter counter SETA 0 WHILE counter < 16 - MSR CPSR_f, $wk3 - MOV $wk3, $wk3, LSL #4 + MSR CPSR_f, $wk2 + MOV $wk2, $wk2, LSL #4 ORRPL $wk0, ht, $wk0, LSL #2 ORRMI $wk0, ht_info, $wk0, LSL #2 ORRNE $wk0, ht, $wk0, LSL #2 @@ -973,8 +973,8 @@ counter SETA counter + 4 WEND counter SETA 0 WHILE counter < 16 - MSR CPSR_f, $wk3 - MOV $wk3, $wk3, LSL #4 + MSR CPSR_f, $wk2 + MOV $wk2, $wk2, LSL #4 ORRPL $wk1, ht, $wk1, LSL #2 ORRMI $wk1, ht_info, $wk1, LSL #2 ORRNE $wk1, ht, $wk1, LSL #2 @@ -987,8 +987,13 @@ counter SETA counter + 4 WEND counter SETA 0 WHILE counter < 16 + [ counter = 0 + MSR CPSR_f, $wk3 + MOV $wk4, $wk3, LSL #4 + | MSR CPSR_f, $wk4 MOV $wk4, $wk4, LSL #4 + ] ORRPL $wk2, ht, $wk2, LSL #2 ORRMI $wk2, ht_info, $wk2, LSL #2 ORRNE $wk2, ht, $wk2, LSL #2 From 2b0279ab33949f75fb192a37864d188d219c461c Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Mon, 29 Mar 2021 15:51:51 +0100 Subject: [PATCH 09/17] Fix type of halftone array for 64-bit targets The halftone array is accessed using a hard-coded multiplier of 4 bytes, therefore the type of each element needs to be 32 bit on every platform. `sqInt` is not appropriate for this use, since it is a 64-bit type on 64-bit platforms. Rather than unilaterally introduce C99 stdint types, use `unsigned int` since this wil be 32-bit on both current fast path binary targets. 
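A stand-alone sketch of the assumption being protected (hypothetical code, not the
plugin's): the fast paths address halftone entries at byte offset 4*i, which only
lines up with the C declaration if each element is 4 bytes wide.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int halftone[4];              /* element type after this patch */
        assert((uintptr_t) &halftone[1] - (uintptr_t) &halftone[0] == 4);
        /* Declared with a 64-bit element type (as sqInt is on 64-bit
         * platforms), the same difference would be 8, so every entry after
         * the first would be read from the wrong offset. */
        return 0;
    }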
--- platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h index 78fbfdc059..136b2407e5 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h @@ -116,7 +116,7 @@ typedef struct { unsigned int (*cmLookupTable)[]; bool noHalftone; usqInt halftoneHeight; - sqInt (*halftoneBase)[]; + unsigned int (*halftoneBase)[]; union { sqInt sourceAlpha; struct { From e22ae0b9cf2a4b42c99dc2471c4937eb40c39cad Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 27 Apr 2021 12:25:01 +0100 Subject: [PATCH 10/17] Detect and add a new fast path flag for effective-1bpp colour maps Sometimes, colour maps are used such that all entries except the first contain the same value. Combined with the fact that only source colour 0 uses colour map entry 0 (any other colours for which all non-0 bits would otherwise be discarded during index generation are forced to use entry 1 instead), this effectively acts as a 2-entry (or 1bpp) map, depending on whether the source colour is 0 or not. This is far more efficiently coded in any fast path by a test against zero, than by a table lookup - it frees up 2 KB, 16 KB or 128 KB of data cache space, depending on whether a 9-, 12- or 15-bit colour map was used. There is an up-front cost to scanning the colour map to see if its entries are of this nature, however in most "normal" colour maps, this scan will rapidly be aborted. --- .../plugins/BitBltPlugin/BitBltDispatch.c | 71 +++++++++++-------- .../plugins/BitBltPlugin/BitBltInternal.h | 32 +++++---- 2 files changed, 60 insertions(+), 43 deletions(-) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c index 6065dbab83..10a762be43 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c @@ -310,41 +310,56 @@ void copyBitsDispatch(operation_t *op) if (op->cmFlags & ColorMapIndexedPart) { if (op->cmFlags & ColorMapFixedPart) { - if (op->src.depth == 32) { - if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0) - flags |= FAST_PATH_15BIT_COLOR_MAP; - else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0) - flags |= FAST_PATH_12BIT_COLOR_MAP; - else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0) - flags |= FAST_PATH_9BIT_COLOR_MAP; - else { - /* Unsupported case 1 */ - copyBitsFallback(op, 0); + /* First check whether all except the first colour map entry match. 
+ * This indicates that the operation depends only upon whether + * each pixel has value 0 or not (note that non-zero pixels that + * reduce to 0 when bit-packed use lookup index 1) + */ + usqInt i; + flags |= FAST_PATH_1BIT_COLOR_MAP; + for (i = op->cmMask; i >= 2; --i) { + if ((*op->cmLookupTable)[i] != (*op->cmLookupTable)[1]) { + flags &= ~FAST_PATH_1BIT_COLOR_MAP; + break; + } + } + if ((flags & FAST_PATH_1BIT_COLOR_MAP) == 0) { + if (op->src.depth == 32) { + if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0) + flags |= FAST_PATH_15BIT_COLOR_MAP; + else if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable84, sizeof maskTable84) == 0 && memcmp(op->cmShiftTable, shiftTable84, sizeof shiftTable84) == 0) + flags |= FAST_PATH_12BIT_COLOR_MAP; + else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable83, sizeof maskTable83) == 0 && memcmp(op->cmShiftTable, shiftTable83, sizeof shiftTable83) == 0) + flags |= FAST_PATH_9BIT_COLOR_MAP; + else { + /* Unsupported case 1 */ + copyBitsFallback(op, 0); #ifdef PROFILING - profile_unrecorded_cases[1]++; + profile_unrecorded_cases[1]++; #endif - return; - } - } else if (op->src.depth == 16) { - if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0) - flags |= FAST_PATH_12BIT_COLOR_MAP; - else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0) - flags |= FAST_PATH_9BIT_COLOR_MAP; - else { - /* Unsupported case 2 */ + return; + } + } else if (op->src.depth == 16) { + if (op->cmMask == 0xFFF && memcmp(op->cmMaskTable, maskTable54, sizeof maskTable54) == 0 && memcmp(op->cmShiftTable, shiftTable54, sizeof shiftTable54) == 0) + flags |= FAST_PATH_12BIT_COLOR_MAP; + else if (op->cmMask == 0x1FF && memcmp(op->cmMaskTable, maskTable53, sizeof maskTable53) == 0 && memcmp(op->cmShiftTable, shiftTable53, sizeof shiftTable53) == 0) + flags |= FAST_PATH_9BIT_COLOR_MAP; + else { + /* Unsupported case 2 */ + copyBitsFallback(op, 0); +#ifdef PROFILING + profile_unrecorded_cases[2]++; +#endif + return; + } + } else { + /* Unsupported case 3 */ copyBitsFallback(op, 0); #ifdef PROFILING - profile_unrecorded_cases[2]++; + profile_unrecorded_cases[3]++; #endif return; } - } else { - /* Unsupported case 3 */ - copyBitsFallback(op, 0); -#ifdef PROFILING - profile_unrecorded_cases[3]++; -#endif - return; } } else { if ((op->src.depth < 16 && op->cmMask == (1u << op->src.depth) - 1) || diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h b/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h index a653fb64c1..f82cb1d24e 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h @@ -62,21 +62,22 @@ enum { FAST_PATH_DEST_LITTLE_ENDIAN = 1u<<16, FAST_PATH_NO_COLOR_MAP = 1u<<17, - FAST_PATH_9BIT_COLOR_MAP = 1u<<18, - FAST_PATH_12BIT_COLOR_MAP = 1u<<19, - FAST_PATH_15BIT_COLOR_MAP = 1u<<20, + FAST_PATH_1BIT_COLOR_MAP = 1u<<18, + FAST_PATH_9BIT_COLOR_MAP = 1u<<19, + FAST_PATH_12BIT_COLOR_MAP = 1u<<20, + FAST_PATH_15BIT_COLOR_MAP = 1u<<21, FAST_PATH_DIRECT_COLOR_MAP = FAST_PATH_15BIT_COLOR_MAP, /* for use with <16bpp */ - FAST_PATH_NO_HALFTONE = 1u<<21, - FAST_PATH_SCALAR_HALFTONE = 1u<<22, - FAST_PATH_VECTOR_HALFTONE = 1u<<23, + FAST_PATH_NO_HALFTONE = 1u<<22, + 
FAST_PATH_SCALAR_HALFTONE = 1u<<23, + FAST_PATH_VECTOR_HALFTONE = 1u<<24, - FAST_PATH_CA_NO_GAMMA = 1u<<24, - FAST_PATH_CA_HAS_GAMMA = 1u<<25, + FAST_PATH_CA_NO_GAMMA = 1u<<25, + FAST_PATH_CA_HAS_GAMMA = 1u<<26, - FAST_PATH_NO_OVERLAP = 1u<<26, - FAST_PATH_H_OVERLAP = 1u<<27, - FAST_PATH_V_OVERLAP = 1u<<28, + FAST_PATH_NO_OVERLAP = 1u<<27, + FAST_PATH_H_OVERLAP = 1u<<28, + FAST_PATH_V_OVERLAP = 1u<<29, }; /* These are the derived macros for use in specifying fast paths. */ @@ -102,10 +103,11 @@ enum { #define ONLY_DEST_BIG_ENDIAN (FAST_PATH_DEST_LITTLE_ENDIAN) #define ONLY_DEST_LITTLE_ENDIAN (FAST_PATH_DEST_LITTLE_ENDIAN) -#define ONLY_NO_COLOR_MAP (FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) -#define ONLY_9BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) -#define ONLY_12BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) -#define ONLY_15BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP) +#define ONLY_NO_COLOR_MAP (FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_1BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_9BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_12BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_15BIT_COLOR_MAP) +#define ONLY_15BIT_COLOR_MAP (FAST_PATH_NO_COLOR_MAP | FAST_PATH_1BIT_COLOR_MAP | FAST_PATH_9BIT_COLOR_MAP | FAST_PATH_12BIT_COLOR_MAP) #define ONLY_DIRECT_COLOR_MAP ONLY_15BIT_COLOR_MAP /* for use with <16bpp */ #define ONLY_NO_HALFTONE (FAST_PATH_SCALAR_HALFTONE | FAST_PATH_VECTOR_HALFTONE) From e44e2c89783fe8b46d9ee7eec1242bfb97f91dfb Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 31 Mar 2021 12:01:47 +0100 Subject: [PATCH 11/17] C fast path for 32bpp alphaBlend This runs approx 2.6x faster when benchmarked on Cortex-A72 in AArch64. 
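For reference, the per-channel arithmetic used below, written out for a single 8-bit
channel (a sketch, not the committed code; the fast path evaluates the A,G and R,B
channel pairs together in one 32-bit word, which works because each 16-bit lane stays
within 16 bits throughout and so cannot carry into its neighbour):

    /* s, d, a in 0..255; result approximates (s*a + d*(255-a)) / 255,
     * using only shifts and adds in place of the division. */
    static inline unsigned int blendChannel(unsigned int s, unsigned int d,
                                            unsigned int a)
    {
        unsigned int t = s * a + d * (255 - a) + 0xFF;   /* at most 0xFF00 */
        return ((t >> 8) + t) >> 8;
    }

blendChannel(sR, dR, alpha) should correspond to the R byte produced by one
iteration of the inner loop below.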
--- .../plugins/BitBltPlugin/BitBltGeneric.c | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index cb474b1f95..526ab7fc8d 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -205,6 +205,37 @@ static void fastPathSourceWord32_32(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathAlphaBlend32_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + uint32_t *src = srcBits + srcPitch * srcY + srcX; + uint32_t *dest = destBits + destPitch * destY + destX; + srcPitch -= width; + destPitch -= width; + do { + uint32_t remain = width; + do { + uint32_t s = *src++; + uint32_t d = *dest; + unsigned int alpha = (s >> 24) & 0xFF; + unsigned int unAlpha = 0xFF - alpha; + uint32_t sAG = ((s >> 8) & 0xFF) | 0xFF0000; + uint32_t sRB = s & 0xFF00FF; + uint32_t dAG = (d >> 8) & 0xFF00FF; + uint32_t dRB = d & 0xFF00FF; + uint32_t blendAG = sAG * alpha + dAG * unAlpha + 0xFF00FF; + uint32_t blendRB = sRB * alpha + dRB * unAlpha + 0xFF00FF; + blendAG = ((((blendAG >> 8) & 0xFF00FF) + blendAG) >> 8) & 0xFF00FF; + blendRB = ((((blendRB >> 8) & 0xFF00FF) + blendRB) >> 8) & 0xFF00FF; + d = (blendAG << 8) | blendRB; + *dest++ = d; + } while (--remain > 0); + src += srcPitch; + dest += destPitch; + } while (--height > 0); +} + static void fastPathRightToLeft(operation_t *op, uint32_t flags) { @@ -453,6 +484,7 @@ static fast_path_t fastPaths[] = { { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, + { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, { fastPathNoOp, CR_destinationWord, 0 }, /* Some special fast paths to extend the abilities of the others in corner cases */ From 45649aa5af90d36df13dd574e7ef31148b732035 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Fri, 23 Apr 2021 13:39:05 +0100 Subject: [PATCH 12/17] C fast path for planar alphaBlend --- .../plugins/BitBltPlugin/BitBltGeneric.c | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 526ab7fc8d..616ff78a91 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -205,6 +205,36 @@ static void fastPathSourceWord32_32(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathAlphaBlend0_32_scalar(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + uint32_t *dest = destBits + destPitch * destY + destX; + destPitch -= width; + uint32_t s = (*op->halftoneBase)[0]; + unsigned int alpha = (s >> 24) & 0xFF; + unsigned int unAlpha = 0xFF - alpha; + uint32_t sAG = ((s >> 8) & 0xFF) | 0xFF0000; + uint32_t sRB = s & 0xFF00FF; + sAG *= alpha; + sRB *= alpha; + do { + uint32_t remain = width; + do { + uint32_t d = *dest; + uint32_t dAG = (d >> 8) & 0xFF00FF; + uint32_t dRB = d & 0xFF00FF; + uint32_t blendAG = sAG + dAG * unAlpha + 0xFF00FF; + uint32_t blendRB = sRB + dRB * unAlpha + 0xFF00FF; + blendAG = ((((blendAG >> 8) & 0xFF00FF) + blendAG) >> 8) & 0xFF00FF; + blendRB = ((((blendRB >> 8) & 0xFF00FF) + blendRB) >> 8) & 0xFF00FF; + d = (blendAG << 8) | blendRB; + *dest++ = d; + } while 
(--remain > 0); + dest += destPitch; + } while (--height > 0); +} + static void fastPathAlphaBlend32_32(operation_t *op, uint32_t flags) { IGNORE(flags); @@ -484,6 +514,7 @@ static fast_path_t fastPaths[] = { { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, + { fastPathAlphaBlend0_32_scalar, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, { fastPathNoOp, CR_destinationWord, 0 }, From 405f35babf738496b28b0849fc7086b078358e46 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 14 Apr 2021 15:20:14 +0100 Subject: [PATCH 13/17] C fast path for 8->32bpp conversion --- .../plugins/BitBltPlugin/BitBltGeneric.c | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 616ff78a91..04d255440f 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -192,6 +192,109 @@ static void fastPathSourceWord0_32_scalar(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathSourceWord8_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint8_t, uint32_t); + uint8_t *src = srcBits + srcPitch * srcY + (srcX ^ 3); + uint32_t *dest = destBits + destPitch * destY + destX; + switch ((srcX + width) & 3) { + case 0: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 2) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 1) + { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); + break; + case 1: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 2) + { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + if ((x & 3) >= 1) + *dest++ = (*cmLookupTable)[*s--]; + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s]; + s += 7; + *dest++ = (*cmLookupTable)[*s--]; + } + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); + break; + case 2: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + } + if ((x & 3) >= 2) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 1) + *dest++ = (*cmLookupTable)[*s--]; + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s]; + s += 7; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + } + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); + break; + case 3: + do { + int32_t x = width; + uint8_t *s = src; + if ((x & 3) >= 3) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 2) + *dest++ = (*cmLookupTable)[*s--]; + if ((x & 3) >= 1) + *dest++ = (*cmLookupTable)[*s--]; + for (x -= 4; x >= 0; x -= 4) { + *dest++ = (*cmLookupTable)[*s]; + s += 7; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + *dest++ = (*cmLookupTable)[*s--]; + } + src += srcPitch; + dest += destPitch - width; + } 
while (--height > 0); + break; + } +} + static void fastPathSourceWord32_32(operation_t *op, uint32_t flags) { IGNORE(flags); @@ -513,6 +616,7 @@ static fast_path_t fastPaths[] = { { fastPathClearWord32, CR_clearWord, STD_FLAGS_NO_SOURCE(32,NO) }, { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + { fastPathSourceWord8_32, CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, { fastPathAlphaBlend0_32_scalar, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, From dac723ffd268789b93124573669e541dee9616e1 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 27 Apr 2021 12:35:49 +0100 Subject: [PATCH 14/17] C fast path for alphaBlend with 1bpp colour map and scalar halftone --- .../plugins/BitBltPlugin/BitBltGeneric.c | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index 04d255440f..e32b543d7b 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -338,6 +338,42 @@ static void fastPathAlphaBlend0_32_scalar(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathAlphaBlend32_32_map1_scalar(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + uint32_t *src = srcBits + srcPitch * srcY + srcX; + uint32_t *dest = destBits + destPitch * destY + destX; + srcPitch -= width; + destPitch -= width; + uint32_t s[2] = { (*op->cmLookupTable)[0] & (*op->halftoneBase)[0], (*op->cmLookupTable)[1] & (*op->halftoneBase)[0] }; + unsigned int alpha[2] = { (s[0] >> 24) & 0xFF, (s[1] >> 24) & 0xFF }; + unsigned int unAlpha[2] = { 0xFF - alpha[0], 0xFF - alpha[1] }; + uint32_t sAG[2] = { ((s[0] >> 8) & 0xFF) | 0xFF0000, ((s[1] >> 8) & 0xFF) | 0xFF0000 }; + uint32_t sRB[2] = { s[0] & 0xFF00FF, s[1] & 0xFF00FF }; + sAG[0] *= alpha[0]; + sAG[1] *= alpha[1]; + sRB[0] *= alpha[0]; + sRB[1] *= alpha[1]; + do { + uint32_t remain = width; + do { + uint32_t s = !!*src++; + uint32_t d = *dest; + uint32_t dAG = (d >> 8) & 0xFF00FF; + uint32_t dRB = d & 0xFF00FF; + uint32_t blendAG = sAG[s] + dAG * unAlpha[s] + 0xFF00FF; + uint32_t blendRB = sRB[s] + dRB * unAlpha[s] + 0xFF00FF; + blendAG = ((((blendAG >> 8) & 0xFF00FF) + blendAG) >> 8) & 0xFF00FF; + blendRB = ((((blendRB >> 8) & 0xFF00FF) + blendRB) >> 8) & 0xFF00FF; + d = (blendAG << 8) | blendRB; + *dest++ = d; + } while (--remain > 0); + src += srcPitch; + dest += destPitch; + } while (--height > 0); +} + static void fastPathAlphaBlend32_32(operation_t *op, uint32_t flags) { IGNORE(flags); @@ -619,6 +655,7 @@ static fast_path_t fastPaths[] = { { fastPathSourceWord8_32, CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) }, { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP }, { fastPathAlphaBlend0_32_scalar, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + { fastPathAlphaBlend32_32_map1_scalar, CR_alphaBlend, STD_FLAGS(32,32,1BIT,SCALAR) }, { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, { fastPathNoOp, CR_destinationWord, 0 }, From 80cd2da48177ef814c9ce3905ee66866a9b2fbc0 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 28 Apr 2021 16:32:56 +0100 Subject: [PATCH 15/17] Apply scalar halftoning to colour map entries instead for 32bpp destination This makes better use of 
existing fast paths, and applies to all platforms. --- .../plugins/BitBltPlugin/BitBltGeneric.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c index e32b543d7b..677c886688 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c @@ -640,6 +640,30 @@ static void fastPathDepthConv(operation_t *op, uint32_t flags) } while (--height > 0); } +static void fastPathEarlyHalftone(operation_t *op, uint32_t flags) +{ + /* With a 32bpp destination, the action of halftoning has no dependency on + * the destination X coordinate of a pixel. Furthermore, scalar halftoning + * doesn't depend on the destination Y coordinate either. If there's a + * colour map as well, this means we can apply the halftone to the colour + * map entries instead, which makes it easier to find a match for the + * remaining part of the operation from amongst the available fast paths. + */ + uint32_t flags2 = (flags &~ FAST_PATH_SCALAR_HALFTONE) | FAST_PATH_NO_HALFTONE; + void (*func)(operation_t *, uint32_t) = lookupFastPath(op->combinationRule, flags2); + if (func == NULL) { + copyBitsFallback(op, flags); + } else { + uint32_t cmLookupTable2[1<<15]; + for (int32_t i = op->cmMask; i >= 0; --i) + cmLookupTable2[i] = (*op->cmLookupTable)[i] & (*op->halftoneBase)[0]; + operation_t op2 = *op; + op2.cmLookupTable = &cmLookupTable2; + op2.noHalftone = true; + func(&op2, flags2); + } +} + static void fastPathNoOp(operation_t *op, uint32_t flags) { IGNORE(op); @@ -662,6 +686,7 @@ static fast_path_t fastPaths[] = { /* Some special fast paths to extend the abilities of the others in corner cases */ { fastPathRightToLeft, CR_any, FAST_PATH_VECTOR_HALFTONE | ONLY_H_OVERLAP }, { fastPathBottomToTop, CR_any, FAST_PATH_VECTOR_HALFTONE | ONLY_V_OVERLAP }, + { fastPathEarlyHalftone, CR_any, FAST_PATH_SRC_0BPP | ONLY_DEST_32BPP | FAST_PATH_NO_COLOR_MAP | ONLY_SCALAR_HALFTONE }, { fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_32BPP | ONLY_DEST_32BPP }, { fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_16BPP | ONLY_DEST_16BPP }, { fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_8BPP | ONLY_DEST_8BPP }, From e4a27ec89afebd9a88dc3ab07992b7daab0940cc Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 24 Mar 2021 14:08:15 +0000 Subject: [PATCH 16/17] Enable fast blit code for AArch64 --- build.linux64ARMv8/squeak.cog.spur/build.assert/mvm | 1 + build.linux64ARMv8/squeak.cog.spur/build.debug/mvm | 1 + build.linux64ARMv8/squeak.cog.spur/build/mvm | 1 + platforms/unix/config/configure | 12 ++++++++++++ platforms/unix/plugins/BitBltPlugin/acinclude.m4 | 11 +++++++++++ 5 files changed, 26 insertions(+) diff --git a/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm b/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm index 5d272d3adf..6be9484c68 100755 --- a/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm +++ b/build.linux64ARMv8/squeak.cog.spur/build.assert/mvm @@ -21,6 +21,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . 
|| cp - test -f config.h || ../../../platforms/unix/config/configure \ --with-vmversion=5.0 --with-src=spur64src \ --without-vm-display-fbdev --without-npsqueak \ + --enable-fast-bitblt \ CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \ LIBS="-lrt" diff --git a/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm b/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm index 6227b8afe0..abe003b808 100755 --- a/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm +++ b/build.linux64ARMv8/squeak.cog.spur/build.debug/mvm @@ -21,6 +21,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp - test -f config.h || ../../../platforms/unix/config/configure \ --with-vmversion=5.0 --with-src=spur64src \ --without-vm-display-fbdev --without-npsqueak \ + --enable-fast-bitblt \ CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \ LIBS="-lrt" diff --git a/build.linux64ARMv8/squeak.cog.spur/build/mvm b/build.linux64ARMv8/squeak.cog.spur/build/mvm index b2bcf24c4f..63f42b7253 100755 --- a/build.linux64ARMv8/squeak.cog.spur/build/mvm +++ b/build.linux64ARMv8/squeak.cog.spur/build/mvm @@ -22,6 +22,7 @@ test -f plugins.ext || (test -f ../plugins.ext && cp -p ../plugins.ext . || cp - test -f config.h || ../../../platforms/unix/config/configure \ --with-vmversion=5.0 --with-src=spur64src \ --without-npsqueak \ + --enable-fast-bitblt \ CFLAGS="$MACHINE $OPT -DCOGMTVM=0 -DDUAL_MAPPED_CODE_ZONE=1" \ LIBS="-lrt" ## --without-vm-display-fbdev --without-npsqueak \ diff --git a/platforms/unix/config/configure b/platforms/unix/config/configure index 7b7c747b75..a2f424d486 100755 --- a/platforms/unix/config/configure +++ b/platforms/unix/config/configure @@ -18395,6 +18395,18 @@ if test "${enable_fast_bitblt+set}" = set; then : fi +;; + +aarch64) +# Check whether --enable-fast-bitblt was given. 
+if test "${enable_fast_bitblt+set}" = set; then : + enableval=$enable_fast_bitblt; if test "x$enableval" = "xyes" ; then + bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_flags="-DENABLE_FAST_BLT" + fi + +fi + ;; esac diff --git a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 index a999922c49..146b16acee 100644 --- a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 +++ b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 @@ -17,6 +17,17 @@ AC_ARG_ENABLE(fast-bitblt, ], []) ;; + +aarch64) +AC_ARG_ENABLE(fast-bitblt, + [ --enable-fast-bitblt enable fast BitBlt optimizations (default=no)], + [ if test "x$enableval" = "xyes" ; then + bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_flags="-DENABLE_FAST_BLT" + fi + ], + []) +;; esac AC_SUBST(BITBLT_OBJS, $bitblt_objs) From 10d8a11296926346152fa2556864201ee7a0e166 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Wed, 31 Mar 2021 12:04:45 +0100 Subject: [PATCH 17/17] AArch64 assembly optimisations --- .../Cross/plugins/BitBltPlugin/BitBltArm64.c | 2482 +++++++++++++++++ .../Cross/plugins/BitBltPlugin/BitBltArm64.h | 30 + .../plugins/BitBltPlugin/BitBltDispatch.c | 55 +- platforms/unix/config/configure | 2 +- .../unix/plugins/BitBltPlugin/acinclude.m4 | 2 +- 5 files changed, 2568 insertions(+), 3 deletions(-) create mode 100644 platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c create mode 100644 platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c new file mode 100644 index 0000000000..5ad9db04c4 --- /dev/null +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.c @@ -0,0 +1,2482 @@ +/* + * Copyright © 2021 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + */ + +#include +#include + +#include "BitBltInternal.h" + +#define YES(x) x +#define NO(x) + +#define LOAD_SRC_512_INTERLEAVE_YES \ + "ld4 {v0.16b-v3.16b}, [%[src]], #512/8 \n\t" \ + +#define LOAD_SRC_512_INTERLEAVE_NO \ + "ld1 {v0.16b-v3.16b}, [%[src]], #512/8 \n\t" \ + +#define LOAD_DEST_512_INTERLEAVE_YES \ + "ld4 {v4.16b-v7.16b}, [%[dest]] \n\t" \ + "prfm pstl1strm, [%[dest]] \n\t" \ + +#define LOAD_DEST_512_INTERLEAVE_NO \ + "ld1 {v4.16b-v7.16b}, [%[dest]] \n\t" \ + "prfm pstl1strm, [%[dest]] \n\t" \ + +#define STORE_DEST_512_INTERLEAVE_YES \ + "st4 {v0.16b-v3.16b}, [%[dest]] \n\t" \ + +#define STORE_DEST_512_INTERLEAVE_NO \ + "st1 {v0.16b-v3.16b}, [%[dest]] \n\t" \ + +#define LOAD_SRC_256_INTERLEAVE_YES \ + "ld4 {v0.8b-v3.8b}, [%[src]], #256/8 \n\t" \ + +#define LOAD_SRC_256_INTERLEAVE_NO \ + "ld1 {v0.8b-v3.8b}, [%[src]], #256/8 \n\t" \ + +#define LOAD_DEST_256_INTERLEAVE_YES \ + "ld4 {v4.8b-v7.8b}, [%[dest]] \n\t" \ + +#define LOAD_DEST_256_INTERLEAVE_NO \ + "ld1 {v4.8b-v7.8b}, [%[dest]] \n\t" \ + +#define STORE_DEST_256_INTERLEAVE_YES \ + "st4 {v0.8b-v3.8b}, [%[dest]] \n\t" \ + +#define STORE_DEST_256_INTERLEAVE_NO \ + "st1 {v0.8b-v3.8b}, [%[dest]] \n\t" \ + +#define UNPACK_YES(sz,rw) \ + "uzp1 v16."#sz"b, v0."#sz"b, v1."#sz"b \n\t" \ + "uzp2 v17."#sz"b, v0."#sz"b, v1."#sz"b \n\t" \ + rw("uzp1 v20."#sz"b, v4."#sz"b, v5."#sz"b \n\t")\ + rw("uzp2 v21."#sz"b, v4."#sz"b, v5."#sz"b \n\t")\ + "uzp1 v18."#sz"b, v2."#sz"b, v3."#sz"b \n\t" \ + "uzp2 v19."#sz"b, v2."#sz"b, v3."#sz"b \n\t" \ + rw("uzp1 v22."#sz"b, v6."#sz"b, v7."#sz"b \n\t")\ + rw("uzp2 v23."#sz"b, v6."#sz"b, v7."#sz"b \n\t")\ + "uzp1 v0."#sz"b, v16."#sz"b, v18."#sz"b \n\t" \ + "uzp2 v2."#sz"b, v16."#sz"b, v18."#sz"b \n\t" \ + rw("uzp1 v4."#sz"b, v20."#sz"b, v22."#sz"b \n\t")\ + rw("uzp2 v6."#sz"b, v20."#sz"b, v22."#sz"b \n\t")\ + "uzp1 v1."#sz"b, v17."#sz"b, v19."#sz"b \n\t" \ + "uzp2 v3."#sz"b, v17."#sz"b, v19."#sz"b \n\t" \ + rw("uzp1 v5."#sz"b, v21."#sz"b, v23."#sz"b \n\t")\ + rw("uzp2 v7."#sz"b, v21."#sz"b, v23."#sz"b \n\t")\ + +#define UNPACK_NO(sz,rw) /* nothing */ + +#define UNPACK_DEST_YES(sz) \ + "uzp1 v0."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "uzp2 v1."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "uzp1 v2."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + "uzp2 v3."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + "uzp1 v4."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "uzp2 v6."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "uzp1 v5."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + "uzp2 v7."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + +#define UNPACK_DEST_NO(sz) /* nothing */ + +#define PACK_YES(sz) \ + "zip1 v4."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "zip2 v6."#sz"b, v0."#sz"b, v2."#sz"b \n\t" \ + "zip1 v5."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + "zip2 v7."#sz"b, v1."#sz"b, v3."#sz"b \n\t" \ + "zip1 v0."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "zip2 v1."#sz"b, v4."#sz"b, v5."#sz"b \n\t" \ + "zip1 v2."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + "zip2 v3."#sz"b, v6."#sz"b, v7."#sz"b \n\t" \ + +#define PACK_NO(sz) /* nothing */ + +#define DEFINE_FAST_PATH_0_32(op,interleave,rw) \ +static void fastPath##op##_0_32(operation_t *o, uint32_t flags) \ +{ \ + op##_0_32_DECLARE_TEMPORARIES; \ + IGNORE(flags); \ + COPY_OP_TO_LOCALS(o, uint32_t, uint32_t); \ + uint32_t *dest = destBits + destPitch * destY + destX; \ + destPitch -= width; \ + __asm__ volatile ( \ + op##_0_32_INIT \ + "" \ + : /* Outputs */ \ + [dest]"+r"(dest) /* just so we can handle comma next */ \ + op##_0_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + [halftone]"r"(halftoneBase) \ + ); \ + do { \ + uint32_t remain = width; \ + 
__asm__ volatile ( \ + "tst %[dest], #4 \n\t" \ + "b.eq 10f \n\t" \ + op##_0_32_32 \ + "sub %[x], %[x], #1 \n\t" \ + "10: \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.mi 30f \n\t" \ + "20: \n\t" \ + LOAD_DEST_512_INTERLEAVE_##interleave \ + op##_0_32_512 \ + STORE_DEST_512_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #512/8 \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.pl 20b \n\t" \ + "30: \n\t" \ + "add %[x], %[x], #512/32 \n\t" \ + "cmp %[x], #32/32 \n\t" \ + "b.lo 90f \n\t" \ + "cmp %[x], #256/32 \n\t" \ + "b.lo 70f \n\t" \ + "b.eq 60f \n\t" \ + "tbz %[x], #8-5, 8f \n\t" \ + "ld1 {v4.16b-v5.16b}, [%[dest]], #256/8 \n\t" \ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ldr q6, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d7, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s17, [%[dest]], #32/8 \n\t" \ + "mov v7.s[2], v17.s[0] \n\t" \ + "5: \n\t" \ + "sub %[dest], %[dest], %[x], lsl #2 \n\t" \ + UNPACK_DEST_##interleave(16) \ + op##_0_32_512 \ + PACK_##interleave(16) \ + "tbz %[x], #8-5, 8f \n\t" \ + "st1 {v0.16b-v1.16b}, [%[dest]], #256/8 \n\t" \ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "str q2, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d3, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "mov v3.s[0], v3.s[2] \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "b 90f \n\t" \ + "60: \n\t" \ + LOAD_DEST_256_INTERLEAVE_##interleave \ + op##_0_32_256 \ + STORE_DEST_256_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #256/8 \n\t" \ + "b 90f \n\t" \ + "70: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ld1 {v4.8b-v5.8b}, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d6, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s7, [%[dest]], #32/8 \n\t" \ + "5: \n\t" \ + "sub %[dest], %[dest], %[x], lsl #2 \n\t" \ + UNPACK_DEST_##interleave(8) \ + op##_0_32_256 \ + PACK_##interleave(8) \ + "tbz %[x], #7-5, 7f \n\t" \ + "st1 {v0.8b-v1.8b}, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d2, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "90: \n\t" \ + : /* Outputs */ \ + [dest]"+r"(dest), \ + [x]"+r"(remain) \ + op##_0_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + : /* Clobbers */ \ + "memory" \ + ); \ + dest += destPitch; \ + } while (--height > 0); \ +} \ + +#define DEFINE_FAST_PATH_32_32(op,interleave,rw) \ +static void fastPath##op##_32_32(operation_t *o, uint32_t flags) \ +{ \ + op##_32_32_DECLARE_STACK; \ + op##_32_32_DECLARE_TEMPORARIES; \ + IGNORE(flags); \ + COPY_OP_TO_LOCALS(o, uint32_t, uint32_t); \ + uint32_t *src = srcBits + srcPitch * srcY + srcX; \ + uint32_t *dest = destBits + destPitch * destY + destX; \ + uint32_t *clut = *cmLookupTable; \ + srcPitch -= width; \ + destPitch -= width; \ + __asm__ volatile ( \ + op##_32_32_INIT \ + "" \ + : /* Outputs */ \ + [clut]"+r"(clut) \ + op##_32_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + [halftone]"r"(halftoneBase) \ + op##_32_32_PASS_STACK \ + : /* Clobbers */ \ + "memory" \ + ); \ + do { \ + uint32_t remain = width; \ + __asm__ volatile ( \ + "tst %[dest], #4 \n\t" \ + "b.eq 10f \n\t" \ + op##_32_32_32 \ + "sub %[x], %[x], #1 \n\t" \ + "10: \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.mi 30f 
\n\t" \ + "20: \n\t" \ + LOAD_SRC_512_INTERLEAVE_##interleave \ + rw(LOAD_DEST_512_INTERLEAVE_##interleave) \ + op##_32_32_512 \ + STORE_DEST_512_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #512/8 \n\t" \ + "subs %[x], %[x], #512/32 \n\t" \ + "b.pl 20b \n\t" \ + "30: \n\t" \ + "add %[x], %[x], #512/32 \n\t" \ + "cmp %[x], #32/32 \n\t" \ + "b.lo 90f \n\t" \ + "b.eq 80f \n\t" \ + "cmp %[x], #256/32 \n\t" \ + "b.lo 70f \n\t" \ + "b.eq 60f \n\t" \ + "tbz %[x], #8-5, 8f \n\t" \ + "ld1 {v0.16b-v1.16b}, [%[src]], #256/8 \n\t" \ + rw("ld1 {v4.16b-v5.16b}, [%[dest]], #256/8 \n\t")\ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ldr q2, [%[src]], #128/8 \n\t" \ + rw("ldr q6, [%[dest]], #128/8 \n\t")\ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d3, [%[src]], #64/8 \n\t" \ + rw("ldr d7, [%[dest]], #64/8 \n\t")\ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s16, [%[src]], #32/8 \n\t" \ + rw("ldr s17, [%[dest]], #32/8 \n\t")\ + "mov v3.s[2], v16.s[0] \n\t" \ + rw("mov v7.s[2], v17.s[0] \n\t")\ + "5: \n\t" \ + rw("sub %[dest], %[dest], %[x], lsl #2 \n\t")\ + UNPACK_##interleave(16,rw) \ + op##_32_32_512 \ + PACK_##interleave(16) \ + "tbz %[x], #8-5, 8f \n\t" \ + "st1 {v0.16b-v1.16b}, [%[dest]], #256/8 \n\t" \ + "8: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "str q2, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d3, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "mov v3.s[0], v3.s[2] \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "b 90f \n\t" \ + "60: \n\t" \ + LOAD_SRC_256_INTERLEAVE_##interleave \ + rw(LOAD_DEST_256_INTERLEAVE_##interleave) \ + op##_32_32_256 \ + STORE_DEST_256_INTERLEAVE_##interleave \ + "2: \n\t" \ + "add %[dest], %[dest], #256/8 \n\t" \ + "b 90f \n\t" \ + "70: \n\t" \ + "tbz %[x], #7-5, 7f \n\t" \ + "ld1 {v0.8b-v1.8b}, [%[src]], #128/8 \n\t" \ + rw("ld1 {v4.8b-v5.8b}, [%[dest]], #128/8 \n\t")\ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "ldr d2, [%[src]], #64/8 \n\t" \ + rw("ldr d6, [%[dest]], #64/8 \n\t")\ + "6: \n\t" \ + "tbz %[x], #5-5, 5f \n\t" \ + "ldr s3, [%[src]], #32/8 \n\t" \ + rw("ldr s7, [%[dest]], #32/8 \n\t")\ + "5: \n\t" \ + rw("sub %[dest], %[dest], %[x], lsl #2 \n\t")\ + UNPACK_##interleave(8,rw) \ + op##_32_32_256 \ + PACK_##interleave(8) \ + "tbz %[x], #7-5, 7f \n\t" \ + "st1 {v0.8b-v1.8b}, [%[dest]], #128/8 \n\t" \ + "7: \n\t" \ + "tbz %[x], #6-5, 6f \n\t" \ + "str d2, [%[dest]], #64/8 \n\t" \ + "6: \n\t" \ + "tbz %[x], #5-5, 90f \n\t" \ + "str s3, [%[dest]], #32/8 \n\t" \ + "b 90f \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], %[x], lsl #2 \n\t" \ + "b 90f \n\t" \ + "80: \n\t" \ + op##_32_32_32 \ + "90: \n\t" \ + : /* Outputs */ \ + [src]"+r"(src), \ + [dest]"+r"(dest), \ + [x]"+r"(remain) \ + op##_32_32_PASS_TEMPORARIES \ + : /* Inputs */ \ + : /* Clobbers */ \ + "memory" \ + ); \ + src += srcPitch; \ + dest += destPitch; \ + } while (--height > 0); \ + __asm__ volatile ( \ + op##_32_32_FINAL \ + "" \ + : /* Outputs */ \ + : /* Inputs */ \ + [unused]"I"(0) \ + op##_32_32_PASS_STACK \ + ); \ +} \ + +#define DEFINE_FAST_PATH(op, src_dst, interleave, rw) DEFINE_FAST_PATH_##src_dst(op, interleave, rw) + +/******************************************************************************/ + +#define BitAnd_32_32_DECLARE_STACK + +#define BitAnd_32_32_PASS_STACK + +#define BitAnd_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitAnd_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + 
+#define BitAnd_32_32_INIT + +#define BitAnd_32_32_FINAL + +#define BitAnd_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "and %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitAnd_32_32_256 \ + "and v0.8b, v0.8b, v4.8b \n\t" \ + "and v1.8b, v1.8b, v5.8b \n\t" \ + "and v2.8b, v2.8b, v6.8b \n\t" \ + "and v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitAnd_32_32_512 \ + "and v0.16b, v0.16b, v4.16b \n\t" \ + "and v1.16b, v1.16b, v5.16b \n\t" \ + "and v2.16b, v2.16b, v6.16b \n\t" \ + "and v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitAnd, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitAndInvert_32_32_DECLARE_STACK + +#define BitAndInvert_32_32_PASS_STACK + +#define BitAndInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitAndInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitAndInvert_32_32_INIT + +#define BitAndInvert_32_32_FINAL + +#define BitAndInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "bic %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitAndInvert_32_32_256 \ + "bic v0.8b, v0.8b, v4.8b \n\t" \ + "bic v1.8b, v1.8b, v5.8b \n\t" \ + "bic v2.8b, v2.8b, v6.8b \n\t" \ + "bic v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitAndInvert_32_32_512 \ + "bic v0.16b, v0.16b, v4.16b \n\t" \ + "bic v1.16b, v1.16b, v5.16b \n\t" \ + "bic v2.16b, v2.16b, v6.16b \n\t" \ + "bic v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitAndInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertAnd_32_32_DECLARE_STACK + +#define BitInvertAnd_32_32_PASS_STACK + +#define BitInvertAnd_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertAnd_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertAnd_32_32_INIT + +#define BitInvertAnd_32_32_FINAL + +#define BitInvertAnd_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "bic %w[tmp0], %w[tmp1], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertAnd_32_32_256 \ + "bic v0.8b, v4.8b, v0.8b \n\t" \ + "bic v1.8b, v5.8b, v1.8b \n\t" \ + "bic v2.8b, v6.8b, v2.8b \n\t" \ + "bic v3.8b, v7.8b, v3.8b \n\t" \ + +#define BitInvertAnd_32_32_512 \ + "bic v0.16b, v4.16b, v0.16b \n\t" \ + "bic v1.16b, v5.16b, v1.16b \n\t" \ + "bic v2.16b, v6.16b, v2.16b \n\t" \ + "bic v3.16b, v7.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertAnd, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitXor_32_32_DECLARE_STACK + +#define BitXor_32_32_PASS_STACK + +#define BitXor_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitXor_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitXor_32_32_INIT + +#define BitXor_32_32_FINAL + +#define BitXor_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "eor %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitXor_32_32_256 \ + "eor v0.8b, v0.8b, v4.8b \n\t" \ + "eor v1.8b, v1.8b, v5.8b \n\t" \ + "eor v2.8b, v2.8b, v6.8b \n\t" \ + "eor v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitXor_32_32_512 \ + "eor v0.16b, v0.16b, v4.16b \n\t" \ + "eor v1.16b, v1.16b, v5.16b \n\t" \ + "eor v2.16b, v2.16b, v6.16b \n\t" \ + "eor v3.16b, v3.16b, v7.16b \n\t" \ + 
+DEFINE_FAST_PATH(BitXor, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitOr_32_32_DECLARE_STACK + +#define BitOr_32_32_PASS_STACK + +#define BitOr_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitOr_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitOr_32_32_INIT + +#define BitOr_32_32_FINAL + +#define BitOr_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orr %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitOr_32_32_256 \ + "orr v0.8b, v0.8b, v4.8b \n\t" \ + "orr v1.8b, v1.8b, v5.8b \n\t" \ + "orr v2.8b, v2.8b, v6.8b \n\t" \ + "orr v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitOr_32_32_512 \ + "orr v0.16b, v0.16b, v4.16b \n\t" \ + "orr v1.16b, v1.16b, v5.16b \n\t" \ + "orr v2.16b, v2.16b, v6.16b \n\t" \ + "orr v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitOr, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertAndInvert_32_32_DECLARE_STACK + +#define BitInvertAndInvert_32_32_PASS_STACK + +#define BitInvertAndInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertAndInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertAndInvert_32_32_INIT + +#define BitInvertAndInvert_32_32_FINAL + +#define BitInvertAndInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orr %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "mvn %w[tmp0], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertAndInvert_32_32_256 \ + "orr v0.8b, v0.8b, v4.8b \n\t" \ + "orr v1.8b, v1.8b, v5.8b \n\t" \ + "orr v2.8b, v2.8b, v6.8b \n\t" \ + "orr v3.8b, v3.8b, v7.8b \n\t" \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + +#define BitInvertAndInvert_32_32_512 \ + "orr v0.16b, v0.16b, v4.16b \n\t" \ + "orr v1.16b, v1.16b, v5.16b \n\t" \ + "orr v2.16b, v2.16b, v6.16b \n\t" \ + "orr v3.16b, v3.16b, v7.16b \n\t" \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertAndInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertXor_32_32_DECLARE_STACK + +#define BitInvertXor_32_32_PASS_STACK + +#define BitInvertXor_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertXor_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertXor_32_32_INIT + +#define BitInvertXor_32_32_FINAL + +#define BitInvertXor_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "mvn %w[tmp0], %w[tmp0] \n\t" \ + "eor %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertXor_32_32_256 \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + "eor v0.8b, v0.8b, v4.8b \n\t" \ + "eor v1.8b, v1.8b, v5.8b \n\t" \ + "eor v2.8b, v2.8b, v6.8b \n\t" \ + "eor v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitInvertXor_32_32_512 \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + "eor v0.16b, v0.16b, v4.16b \n\t" \ + "eor v1.16b, v1.16b, v5.16b \n\t" \ + "eor v2.16b, v2.16b, v6.16b \n\t" \ + "eor v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertXor, 
32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertDestination_0_32_DECLARE_STACK + +#define BitInvertDestination_0_32_PASS_STACK + +#define BitInvertDestination_0_32_DECLARE_TEMPORARIES uint64_t tmp + +#define BitInvertDestination_0_32_PASS_TEMPORARIES , [tmp]"=&r"(tmp) + +#define BitInvertDestination_0_32_INIT + +#define BitInvertDestination_0_32_FINAL + +#define BitInvertDestination_0_32_32 \ + "ldr %w[tmp], [%[dest]] \n\t" \ + "mvn %w[tmp], %w[tmp] \n\t" \ + "str %w[tmp], [%[dest]], #4 \n\t" \ + +#define BitInvertDestination_0_32_256 \ + "mvn v0.8b, v4.8b \n\t" \ + "mvn v1.8b, v5.8b \n\t" \ + "mvn v2.8b, v6.8b \n\t" \ + "mvn v3.8b, v7.8b \n\t" \ + +#define BitInvertDestination_0_32_512 \ + "mvn v0.16b, v4.16b \n\t" \ + "mvn v1.16b, v5.16b \n\t" \ + "mvn v2.16b, v6.16b \n\t" \ + "mvn v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertDestination, 0_32, NO, YES) + +/******************************************************************************/ + +#define BitOrInvert_32_32_DECLARE_STACK + +#define BitOrInvert_32_32_PASS_STACK + +#define BitOrInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitOrInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitOrInvert_32_32_INIT + +#define BitOrInvert_32_32_FINAL + +#define BitOrInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orn %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitOrInvert_32_32_256 \ + "orn v0.8b, v0.8b, v4.8b \n\t" \ + "orn v1.8b, v1.8b, v5.8b \n\t" \ + "orn v2.8b, v2.8b, v6.8b \n\t" \ + "orn v3.8b, v3.8b, v7.8b \n\t" \ + +#define BitOrInvert_32_32_512 \ + "orn v0.16b, v0.16b, v4.16b \n\t" \ + "orn v1.16b, v1.16b, v5.16b \n\t" \ + "orn v2.16b, v2.16b, v6.16b \n\t" \ + "orn v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(BitOrInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertSource_32_32_DECLARE_STACK + +#define BitInvertSource_32_32_PASS_STACK + +#define BitInvertSource_32_32_DECLARE_TEMPORARIES uint64_t tmp + +#define BitInvertSource_32_32_PASS_TEMPORARIES , [tmp]"=&r"(tmp) + +#define BitInvertSource_32_32_INIT + +#define BitInvertSource_32_32_FINAL + +#define BitInvertSource_32_32_32 \ + "ldr %w[tmp], [%[src]], #4 \n\t" \ + "mvn %w[tmp], %w[tmp] \n\t" \ + "str %w[tmp], [%[dest]], #4 \n\t" \ + +#define BitInvertSource_32_32_256 \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + +#define BitInvertSource_32_32_512 \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertSource, 32_32, NO, NO) + +/******************************************************************************/ + +#define BitInvertOr_32_32_DECLARE_STACK + +#define BitInvertOr_32_32_PASS_STACK + +#define BitInvertOr_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertOr_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertOr_32_32_INIT + +#define BitInvertOr_32_32_FINAL + +#define BitInvertOr_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "orn %w[tmp0], %w[tmp1], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertOr_32_32_256 \ + "orn v0.8b, v4.8b, v0.8b \n\t" \ + "orn v1.8b, v5.8b, v1.8b \n\t" \ + "orn v2.8b, v6.8b, v2.8b \n\t" \ + 
"orn v3.8b, v7.8b, v3.8b \n\t" \ + +#define BitInvertOr_32_32_512 \ + "orn v0.16b, v4.16b, v0.16b \n\t" \ + "orn v1.16b, v5.16b, v1.16b \n\t" \ + "orn v2.16b, v6.16b, v2.16b \n\t" \ + "orn v3.16b, v7.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertOr, 32_32, NO, YES) + +/******************************************************************************/ + +#define BitInvertOrInvert_32_32_DECLARE_STACK + +#define BitInvertOrInvert_32_32_PASS_STACK + +#define BitInvertOrInvert_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define BitInvertOrInvert_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define BitInvertOrInvert_32_32_INIT + +#define BitInvertOrInvert_32_32_FINAL + +#define BitInvertOrInvert_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "and %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "mvn %w[tmp0], %w[tmp0] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define BitInvertOrInvert_32_32_256 \ + "and v0.8b, v0.8b, v4.8b \n\t" \ + "and v1.8b, v1.8b, v5.8b \n\t" \ + "and v2.8b, v2.8b, v6.8b \n\t" \ + "and v3.8b, v3.8b, v7.8b \n\t" \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + +#define BitInvertOrInvert_32_32_512 \ + "and v0.16b, v0.16b, v4.16b \n\t" \ + "and v1.16b, v1.16b, v5.16b \n\t" \ + "and v2.16b, v2.16b, v6.16b \n\t" \ + "and v3.16b, v3.16b, v7.16b \n\t" \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + +DEFINE_FAST_PATH(BitInvertOrInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define AddWord_32_32_DECLARE_STACK + +#define AddWord_32_32_PASS_STACK + +#define AddWord_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define AddWord_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define AddWord_32_32_INIT + +#define AddWord_32_32_FINAL + +#define AddWord_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "add %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define AddWord_32_32_256 \ + "add v0.2s, v0.2s, v4.2s \n\t" \ + "add v1.2s, v1.2s, v5.2s \n\t" \ + "add v2.2s, v2.2s, v6.2s \n\t" \ + "add v3.2s, v3.2s, v7.2s \n\t" \ + +#define AddWord_32_32_512 \ + "add v0.4s, v0.4s, v4.4s \n\t" \ + "add v1.4s, v1.4s, v5.4s \n\t" \ + "add v2.4s, v2.4s, v6.4s \n\t" \ + "add v3.4s, v3.4s, v7.4s \n\t" \ + +DEFINE_FAST_PATH(AddWord, 32_32, NO, YES) + +/******************************************************************************/ + +#define SubWord_32_32_DECLARE_STACK + +#define SubWord_32_32_PASS_STACK + +#define SubWord_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define SubWord_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define SubWord_32_32_INIT + +#define SubWord_32_32_FINAL + +#define SubWord_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "ldr %w[tmp1], [%[dest]] \n\t" \ + "sub %w[tmp0], %w[tmp0], %w[tmp1] \n\t" \ + "str %w[tmp0], [%[dest]], #4 \n\t" \ + +#define SubWord_32_32_256 \ + "sub v0.2s, v0.2s, v4.2s \n\t" \ + "sub v1.2s, v1.2s, v5.2s \n\t" \ + "sub v2.2s, v2.2s, v6.2s \n\t" \ + "sub v3.2s, v3.2s, v7.2s \n\t" \ + +#define SubWord_32_32_512 \ + "sub v0.4s, v0.4s, v4.4s \n\t" \ + "sub v1.4s, v1.4s, v5.4s \n\t" \ + "sub v2.4s, v2.4s, v6.4s \n\t" \ + "sub v3.4s, v3.4s, v7.4s \n\t" \ + +DEFINE_FAST_PATH(SubWord, 32_32, NO, YES) + +/******************************************************************************/ + 
+#define RgbAdd_32_32_DECLARE_STACK + +#define RgbAdd_32_32_PASS_STACK + +#define RgbAdd_32_32_DECLARE_TEMPORARIES + +#define RgbAdd_32_32_PASS_TEMPORARIES + +#define RgbAdd_32_32_INIT + +#define RgbAdd_32_32_FINAL + +#define RgbAdd_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "uqadd v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbAdd_32_32_256 \ + "uqadd v0.8b, v0.8b, v4.8b \n\t" \ + "uqadd v1.8b, v1.8b, v5.8b \n\t" \ + "uqadd v2.8b, v2.8b, v6.8b \n\t" \ + "uqadd v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbAdd_32_32_512 \ + "uqadd v0.16b, v0.16b, v4.16b \n\t" \ + "uqadd v1.16b, v1.16b, v5.16b \n\t" \ + "uqadd v2.16b, v2.16b, v6.16b \n\t" \ + "uqadd v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbAdd, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbSub_32_32_DECLARE_STACK + +#define RgbSub_32_32_PASS_STACK + +#define RgbSub_32_32_DECLARE_TEMPORARIES + +#define RgbSub_32_32_PASS_TEMPORARIES + +#define RgbSub_32_32_INIT + +#define RgbSub_32_32_FINAL + +#define RgbSub_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "uabd v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbSub_32_32_256 \ + "uabd v0.8b, v0.8b, v4.8b \n\t" \ + "uabd v1.8b, v1.8b, v5.8b \n\t" \ + "uabd v2.8b, v2.8b, v6.8b \n\t" \ + "uabd v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbSub_32_32_512 \ + "uabd v0.16b, v0.16b, v4.16b \n\t" \ + "uabd v1.16b, v1.16b, v5.16b \n\t" \ + "uabd v2.16b, v2.16b, v6.16b \n\t" \ + "uabd v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbSub, 32_32, NO, YES) + +/******************************************************************************/ + +#define AlphaBlend_32_32_DECLARE_STACK + +#define AlphaBlend_32_32_PASS_STACK + +#define AlphaBlend_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define AlphaBlend_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define AlphaBlend_32_32_INIT \ + "movi v31.16b, #0xFF \n\t" \ + "ldr s30, =0xFF000000 \n\t" \ + +#define AlphaBlend_32_32_FINAL + +#define AlphaBlend_32_32_32 \ + "ldrb %w[tmp0], [%[src], #3] \n\t" \ + "ldr s0, [%[src]], #4 \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmp %[tmp0], #0xFF \n\t" \ + "b.eq 1f \n\t" \ + "ldr s1, [%[dest]] \n\t" \ + "orr v2.8b, v30.8b, v0.8b \n\t" \ + "mvn v3.8b, v0.8b \n\t" \ + "dup v0.8b, v0.b[3] \n\t" \ + "dup v3.8b, v3.b[3] \n\t" \ + "umull v0.8h, v2.8b, v0.8b \n\t" \ + "umlal v0.8h, v1.8b, v3.8b \n\t" \ + "cmtst v1.4h, v0.4h, v31.4h \n\t" \ + "usra v0.4h, v0.4h, #8 \n\t" \ + "sub v0.8b, v0.8b, v1.8b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "1: \n\t" \ + "str s0, [%[dest]] \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], #4 \n\t" \ + +#define AlphaBlend_32_32_256 \ + "mov %[tmp0], v3.d[0] \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmp %[tmp0], #-1 \n\t" \ + "b.eq 1f \n\t" \ + "mvn v16.8b, v3.8b \n\t" \ + "umull v0.8h, v0.8b, v3.8b \n\t" \ + "umlal v0.8h, v4.8b, v16.8b \n\t" \ + "umull v1.8h, v1.8b, v3.8b \n\t" \ + "umlal v1.8h, v5.8b, v16.8b \n\t" \ + "umull v2.8h, v2.8b, v3.8b \n\t" \ + "umlal v2.8h, v6.8b, v16.8b \n\t" \ + "umull v3.8h, v31.8b, v3.8b \n\t" \ + "umlal v3.8h, v7.8b, v16.8b \n\t" \ + "cmtst v4.8h, v0.8h, v31.8h \n\t" \ + "cmtst v5.8h, v1.8h, v31.8h \n\t" \ + "cmtst v6.8h, v2.8h, v31.8h \n\t" \ + "cmtst v7.8h, v3.8h, v31.8h \n\t" \ + "usra v0.8h, v0.8h, #8 \n\t" \ + "usra v1.8h, v1.8h, #8 \n\t" \ + "usra v2.8h, v2.8h, #8 \n\t" \ + "usra v3.8h, v3.8h, #8 \n\t" \ + "sub v0.16b, v0.16b, v4.16b \n\t" \ + "sub v1.16b, v1.16b, 
v5.16b \n\t" \ + "sub v2.16b, v2.16b, v6.16b \n\t" \ + "sub v3.16b, v3.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + "1: \n\t" \ + +#define AlphaBlend_32_32_512 \ + "mov %[tmp0], v3.d[0] \n\t" \ + "mov %[tmp1], v3.d[1] \n\t" \ + "cmp %[tmp0], #0 \n\t" \ + "ccmp %[tmp1], #0, #0, eq \n\t" \ + "b.eq 2f \n\t" \ + "cmn %[tmp0], #1 \n\t" \ + "ccmn %[tmp1], #1, #0, eq \n\t" \ + "b.eq 1f \n\t" \ + "mvn v16.16b, v3.16b \n\t" \ + "umull v20.8h, v0.8b, v3.8b \n\t" \ + "umlal v20.8h, v4.8b, v16.8b \n\t" \ + "umull2 v21.8h, v0.16b, v3.16b \n\t" \ + "umlal2 v21.8h, v4.16b, v16.16b \n\t" \ + "umull v22.8h, v1.8b, v3.8b \n\t" \ + "umlal v22.8h, v5.8b, v16.8b \n\t" \ + "umull2 v23.8h, v1.16b, v3.16b \n\t" \ + "umlal2 v23.8h, v5.16b, v16.16b \n\t" \ + "umull v24.8h, v2.8b, v3.8b \n\t" \ + "umlal v24.8h, v6.8b, v16.8b \n\t" \ + "umull2 v25.8h, v2.16b, v3.16b \n\t" \ + "umlal2 v25.8h, v6.16b, v16.16b \n\t" \ + "umull v26.8h, v31.8b, v3.8b \n\t" \ + "umlal v26.8h, v7.8b, v16.8b \n\t" \ + "umull2 v27.8h, v31.16b, v3.16b \n\t" \ + "umlal2 v27.8h, v7.16b, v16.16b \n\t" \ + "cmtst v0.8h, v20.8h, v31.8h \n\t" \ + "cmtst v1.8h, v21.8h, v31.8h \n\t" \ + "cmtst v2.8h, v22.8h, v31.8h \n\t" \ + "cmtst v3.8h, v23.8h, v31.8h \n\t" \ + "cmtst v4.8h, v24.8h, v31.8h \n\t" \ + "cmtst v5.8h, v25.8h, v31.8h \n\t" \ + "cmtst v6.8h, v26.8h, v31.8h \n\t" \ + "cmtst v7.8h, v27.8h, v31.8h \n\t" \ + "usra v20.8h, v20.8h, #8 \n\t" \ + "usra v21.8h, v21.8h, #8 \n\t" \ + "usra v22.8h, v22.8h, #8 \n\t" \ + "usra v23.8h, v23.8h, #8 \n\t" \ + "usra v24.8h, v24.8h, #8 \n\t" \ + "usra v25.8h, v25.8h, #8 \n\t" \ + "usra v26.8h, v26.8h, #8 \n\t" \ + "usra v27.8h, v27.8h, #8 \n\t" \ + "sub v0.16b, v20.16b, v0.16b \n\t" \ + "sub v1.16b, v21.16b, v1.16b \n\t" \ + "sub v2.16b, v22.16b, v2.16b \n\t" \ + "sub v3.16b, v23.16b, v3.16b \n\t" \ + "sub v4.16b, v24.16b, v4.16b \n\t" \ + "sub v5.16b, v25.16b, v5.16b \n\t" \ + "sub v6.16b, v26.16b, v6.16b \n\t" \ + "sub v7.16b, v27.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v20.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v21.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v22.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v23.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v20.d[0] \n\t" \ + "mov v1.d[1], v21.d[0] \n\t" \ + "mov v2.d[1], v22.d[0] \n\t" \ + "mov v3.d[1], v23.d[0] \n\t" \ + "1: \n\t" \ + +DEFINE_FAST_PATH(AlphaBlend, 32_32, YES, YES) + +/******************************************************************************/ + +#define PixPaint_32_32_DECLARE_STACK + +#define PixPaint_32_32_PASS_STACK + +#define PixPaint_32_32_DECLARE_TEMPORARIES uint64_t tmp0, tmp1 + +#define PixPaint_32_32_PASS_TEMPORARIES , [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1) + +#define PixPaint_32_32_INIT + +#define PixPaint_32_32_FINAL + +#define PixPaint_32_32_32 \ + "ldr %w[tmp0], [%[src]], #4 \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "str %w[tmp0], [%[dest]] \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], #4 \n\t" \ + +#define PixPaint_32_32_256 \ + "orr v16.8b, v0.8b, v1.8b \n\t" \ + "orr v17.8b, v2.8b, v3.8b \n\t" \ + "orr v16.8b, v16.8b, v17.8b \n\t" \ + "mov %[tmp0], v16.d[0] \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmeq v16.2s, v0.2s, #0 \n\t" \ + "cmeq v17.2s, v1.2s, #0 \n\t" \ + "cmeq v18.2s, v2.2s, #0 \n\t" \ + "cmeq v19.2s, v3.2s, #0 \n\t" \ + "bit v0.8b, v4.8b, v16.8b \n\t" \ + "bit v1.8b, v5.8b, v17.8b \n\t" \ + "bit v2.8b, v6.8b, v18.8b \n\t" \ + "bit v3.8b, 
v7.8b, v19.8b \n\t" \ + +#define PixPaint_32_32_512 \ + "orr v16.16b, v0.16b, v1.16b \n\t" \ + "orr v17.16b, v2.16b, v3.16b \n\t" \ + "orr v16.16b, v16.16b, v17.16b \n\t" \ + "mov %[tmp0], v16.d[0] \n\t" \ + "mov %[tmp1], v16.d[1] \n\t" \ + "orr %[tmp0], %[tmp0], %[tmp1] \n\t" \ + "cbz %[tmp0], 2f \n\t" \ + "cmeq v16.4s, v0.4s, #0 \n\t" \ + "cmeq v17.4s, v1.4s, #0 \n\t" \ + "cmeq v18.4s, v2.4s, #0 \n\t" \ + "cmeq v19.4s, v3.4s, #0 \n\t" \ + "bit v0.16b, v4.16b, v16.16b \n\t" \ + "bit v1.16b, v5.16b, v17.16b \n\t" \ + "bit v2.16b, v6.16b, v18.16b \n\t" \ + "bit v3.16b, v7.16b, v19.16b \n\t" \ + +DEFINE_FAST_PATH(PixPaint, 32_32, NO, YES) + +/******************************************************************************/ + +#define PixMask_32_32_DECLARE_STACK + +#define PixMask_32_32_PASS_STACK + +#define PixMask_32_32_DECLARE_TEMPORARIES uint64_t tmp + +#define PixMask_32_32_PASS_TEMPORARIES , [tmp]"=&r"(tmp) + +#define PixMask_32_32_INIT + +#define PixMask_32_32_FINAL + +#define PixMask_32_32_32 \ + "ldr %w[tmp], [%[src]], #4 \n\t" \ + "cbz %[tmp], 2f \n\t" \ + "str wzr, [%[dest]] \n\t" \ + "2: \n\t" \ + "add %[dest], %[dest], #4 \n\t" \ + +#define PixMask_32_32_256 \ + "cmeq v16.2s, v0.2s, #0 \n\t" \ + "cmeq v17.2s, v1.2s, #0 \n\t" \ + "cmeq v18.2s, v2.2s, #0 \n\t" \ + "cmeq v19.2s, v3.2s, #0 \n\t" \ + "movi v0.8b, #0 \n\t" \ + "movi v1.8b, #0 \n\t" \ + "movi v2.8b, #0 \n\t" \ + "movi v3.8b, #0 \n\t" \ + "bit v0.8b, v4.8b, v16.8b \n\t" \ + "bit v1.8b, v5.8b, v17.8b \n\t" \ + "bit v2.8b, v6.8b, v18.8b \n\t" \ + "bit v3.8b, v7.8b, v19.8b \n\t" \ + +#define PixMask_32_32_512 \ + "cmeq v16.4s, v0.4s, #0 \n\t" \ + "cmeq v17.4s, v1.4s, #0 \n\t" \ + "cmeq v18.4s, v2.4s, #0 \n\t" \ + "cmeq v19.4s, v3.4s, #0 \n\t" \ + "movi v0.16b, #0 \n\t" \ + "movi v1.16b, #0 \n\t" \ + "movi v2.16b, #0 \n\t" \ + "movi v3.16b, #0 \n\t" \ + "bit v0.16b, v4.16b, v16.16b \n\t" \ + "bit v1.16b, v5.16b, v17.16b \n\t" \ + "bit v2.16b, v6.16b, v18.16b \n\t" \ + "bit v3.16b, v7.16b, v19.16b \n\t" \ + +DEFINE_FAST_PATH(PixMask, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbMax_32_32_DECLARE_STACK + +#define RgbMax_32_32_PASS_STACK + +#define RgbMax_32_32_DECLARE_TEMPORARIES + +#define RgbMax_32_32_PASS_TEMPORARIES + +#define RgbMax_32_32_INIT + +#define RgbMax_32_32_FINAL + +#define RgbMax_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "umax v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbMax_32_32_256 \ + "umax v0.8b, v0.8b, v4.8b \n\t" \ + "umax v1.8b, v1.8b, v5.8b \n\t" \ + "umax v2.8b, v2.8b, v6.8b \n\t" \ + "umax v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbMax_32_32_512 \ + "umax v0.16b, v0.16b, v4.16b \n\t" \ + "umax v1.16b, v1.16b, v5.16b \n\t" \ + "umax v2.16b, v2.16b, v6.16b \n\t" \ + "umax v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbMax, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbMin_32_32_DECLARE_STACK + +#define RgbMin_32_32_PASS_STACK + +#define RgbMin_32_32_DECLARE_TEMPORARIES + +#define RgbMin_32_32_PASS_TEMPORARIES + +#define RgbMin_32_32_INIT + +#define RgbMin_32_32_FINAL + +#define RgbMin_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbMin_32_32_256 \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "umin v1.8b, v1.8b, v5.8b \n\t" \ + "umin v2.8b, v2.8b, v6.8b \n\t" \ + "umin v3.8b, v3.8b, v7.8b \n\t" \ + 
+#define RgbMin_32_32_512 \ + "umin v0.16b, v0.16b, v4.16b \n\t" \ + "umin v1.16b, v1.16b, v5.16b \n\t" \ + "umin v2.16b, v2.16b, v6.16b \n\t" \ + "umin v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbMin, 32_32, NO, YES) + +/******************************************************************************/ + +#define RgbMinInvert_32_32_DECLARE_STACK + +#define RgbMinInvert_32_32_PASS_STACK + +#define RgbMinInvert_32_32_DECLARE_TEMPORARIES + +#define RgbMinInvert_32_32_PASS_TEMPORARIES + +#define RgbMinInvert_32_32_INIT + +#define RgbMinInvert_32_32_FINAL + +#define RgbMinInvert_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "mvn v0.8b, v0.8b \n\t" \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define RgbMinInvert_32_32_256 \ + "mvn v0.8b, v0.8b \n\t" \ + "mvn v1.8b, v1.8b \n\t" \ + "mvn v2.8b, v2.8b \n\t" \ + "mvn v3.8b, v3.8b \n\t" \ + "umin v0.8b, v0.8b, v4.8b \n\t" \ + "umin v1.8b, v1.8b, v5.8b \n\t" \ + "umin v2.8b, v2.8b, v6.8b \n\t" \ + "umin v3.8b, v3.8b, v7.8b \n\t" \ + +#define RgbMinInvert_32_32_512 \ + "mvn v0.16b, v0.16b \n\t" \ + "mvn v1.16b, v1.16b \n\t" \ + "mvn v2.16b, v2.16b \n\t" \ + "mvn v3.16b, v3.16b \n\t" \ + "umin v0.16b, v0.16b, v4.16b \n\t" \ + "umin v1.16b, v1.16b, v5.16b \n\t" \ + "umin v2.16b, v2.16b, v6.16b \n\t" \ + "umin v3.16b, v3.16b, v7.16b \n\t" \ + +DEFINE_FAST_PATH(RgbMinInvert, 32_32, NO, YES) + +/******************************************************************************/ + +#define AlphaBlendConst_32_32_DECLARE_STACK + +#define AlphaBlendConst_32_32_PASS_STACK + +#define AlphaBlendConst_32_32_DECLARE_TEMPORARIES uint32_t alpha = o->opt.sourceAlpha, unAlpha = alpha ^ 0xFF + +#define AlphaBlendConst_32_32_PASS_TEMPORARIES , [alpha]"+r"(alpha), [unAlpha]"+r"(unAlpha) + +#define AlphaBlendConst_32_32_INIT \ + "dup v28.16b, %w[alpha] \n\t" \ + "dup v29.16b, %w[unAlpha] \n\t" \ + "movi v31.16b, #0xFF \n\t" \ + +#define AlphaBlendConst_32_32_FINAL + +#define AlphaBlendConst_32_32_32 \ + "ldr s0, [%[src]], #4 \n\t" \ + "ldr s4, [%[dest]] \n\t" \ + "umull v16.8h, v0.8b, v28.8b \n\t" \ + "umlal v16.8h, v4.8b, v29.8b \n\t" \ + "cmtst v0.8h, v16.8h, v31.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "str s0, [%[dest]], #4 \n\t" \ + +#define AlphaBlendConst_32_32_256 \ + "umull v16.8h, v0.8b, v28.8b \n\t" \ + "umlal v16.8h, v4.8b, v29.8b \n\t" \ + "umull v17.8h, v1.8b, v28.8b \n\t" \ + "umlal v17.8h, v5.8b, v29.8b \n\t" \ + "umull v18.8h, v2.8b, v28.8b \n\t" \ + "umlal v18.8h, v6.8b, v29.8b \n\t" \ + "umull v19.8h, v3.8b, v28.8b \n\t" \ + "umlal v19.8h, v7.8b, v29.8b \n\t" \ + "cmtst v0.8h, v16.8h, v31.8h \n\t" \ + "cmtst v1.8h, v17.8h, v31.8h \n\t" \ + "cmtst v2.8h, v18.8h, v31.8h \n\t" \ + "cmtst v3.8h, v19.8h, v31.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + +#define AlphaBlendConst_32_32_512 \ + "umull v16.8h, v0.8b, v28.8b \n\t" \ + "umlal v16.8h, v4.8b, v29.8b \n\t" \ + "umull2 v17.8h, v0.16b, v28.16b \n\t" \ + "umlal2 v17.8h, v4.16b, v29.16b \n\t" \ + "umull v18.8h, v1.8b, v28.8b \n\t" \ + "umlal v18.8h, v5.8b, v29.8b \n\t" \ + "umull2 
v19.8h, v1.16b, v28.16b \n\t" \ + "umlal2 v19.8h, v5.16b, v29.16b \n\t" \ + "umull v20.8h, v2.8b, v28.8b \n\t" \ + "umlal v20.8h, v6.8b, v29.8b \n\t" \ + "umull2 v21.8h, v2.16b, v28.16b \n\t" \ + "umlal2 v21.8h, v6.16b, v29.16b \n\t" \ + "umull v22.8h, v3.8b, v28.8b \n\t" \ + "umlal v22.8h, v7.8b, v29.8b \n\t" \ + "umull2 v23.8h, v3.16b, v28.16b \n\t" \ + "umlal2 v23.8h, v7.16b, v29.16b \n\t" \ + "cmtst v0.8h, v16.8h, v31.8h \n\t" \ + "cmtst v1.8h, v17.8h, v31.8h \n\t" \ + "cmtst v2.8h, v18.8h, v31.8h \n\t" \ + "cmtst v3.8h, v19.8h, v31.8h \n\t" \ + "cmtst v4.8h, v20.8h, v31.8h \n\t" \ + "cmtst v5.8h, v21.8h, v31.8h \n\t" \ + "cmtst v6.8h, v22.8h, v31.8h \n\t" \ + "cmtst v7.8h, v23.8h, v31.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "usra v20.8h, v20.8h, #8 \n\t" \ + "usra v21.8h, v21.8h, #8 \n\t" \ + "usra v22.8h, v22.8h, #8 \n\t" \ + "usra v23.8h, v23.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "sub v4.16b, v20.16b, v4.16b \n\t" \ + "sub v5.16b, v21.16b, v5.16b \n\t" \ + "sub v6.16b, v22.16b, v6.16b \n\t" \ + "sub v7.16b, v23.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v16.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v17.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v18.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v19.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v16.d[0] \n\t" \ + "mov v1.d[1], v17.d[0] \n\t" \ + "mov v2.d[1], v18.d[0] \n\t" \ + "mov v3.d[1], v19.d[0] \n\t" \ + +DEFINE_FAST_PATH(AlphaBlendConst, 32_32, NO, YES) + +/******************************************************************************/ + +static void fastPathSourceWord_1_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 1; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + destPitch * destY + destX; + + uint32_t *clut = *cmLookupTable; + __asm__ volatile ( + "ldr d6, =0x10000000001 \n\t" + "ld1r {v16.4s}, [%[clut]], #4 \n\t" + "ld1r {v17.4s}, [%[clut]] \n\t" + "ldr q18, =0x10000000200000004000000080000000 \n\t" + "ldr q19, =0x01000000020000000400000008000000 \n\t" + : /* Outputs */ + [clut]"+r"(clut) + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. 
*/ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + uint32_t unskew = (-skew) & 31; + + /* Handle leading pixels, if any */ + uint32_t data, carry; + if (leading_src_bits == 0) { + if (skew == 0) + carry = 0; + else + carry = *s++ << skew; + __asm__ volatile ( + "mov v4.s[1], %w[carry] \n\t" + : /* Outputs */ + : /* Inputs */ + [carry]"r"(carry) + ); + } else { + if (skew == 0) { + data = *s++; + carry = 0; + } + else if (leading_src_bits > skew) { + data = *s++ << skew; + carry = *s++; + data |= carry >> unskew; + carry <<= skew; + } else { + carry = *s++; + data = carry >> unskew; + carry <<= skew; + } + data <<= 32 - leading_src_bits; + + __asm__ volatile ( + "mov v4.s[0], %w[data] \n\t" + "mov v4.s[1], %w[carry] \n\t" + + /* Process 1..7 pixels */ + "dup v3.4s, v4.s[0] \n\t" + "cmtst v0.4s, v3.4s, v18.4s \n\t" + "cmtst v1.4s, v3.4s, v19.4s \n\t" + "bsl v0.16b, v17.16b, v16.16b \n\t" + "bsl v1.16b, v17.16b, v16.16b \n\t" + "tst %[bits], #4 \n\t" + "b.eq 1f \n\t" + "str q0, [%[dest]], #16 \n\t" + "mov v0.16b, v1.16b \n\t" + "1: \n\t" + "tst %[bits], #2 \n\t" + "b.eq 1f \n\t" + "str d0, [%[dest]], #8 \n\t" + "mov v0.d[0], v0.d[1] \n\t" + "1: \n\t" + "tst %[bits], #1 \n\t" + "b.eq 1f \n\t" + "str s0, [%[dest]], #4 \n\t" + "1: \n\t" + : /* Outputs */ + [dest]"+r"(dest) + : /* Inputs */ + [data]"r"(data), + [carry]"r"(carry), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + } + + uint32_t src_bits = srcBPP*256/destBPP; + uint32_t unskew2 = -(unskew & (src_bits - 1)); + uint32_t skew2 = src_bits + unskew2; + __asm__ volatile ( + "mov v5.s[0], %w[unskew2] \n\t" + "mov v5.s[1], %w[skew2] \n\t" + : /* Outputs */ + : /* Inputs */ + [unskew2]"r"(unskew2), + [skew2]"r"(skew2) + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load source word (if necessary) and shuffle */ + "cmp %[unskew], %[src_bits] \n\t" + "b.hs 1f \n\t" + "ushr d0, d4, #32 \n\t" + "ld1r {v4.2s}, [%[s]], #4 \n\t" + "ushl v4.2s, v4.2s, v5.2s \n\t" + "orr v4.8b, v4.8b, v0.8b \n\t" + "b 2f \n\t" + "1: \n\t" + "mul v4.2s, v6.2s, v4.s[1] \n\t" + "2: \n\t" + + /* Process 8 pixels */ + "dup v3.4s, v4.s[0] \n\t" + "cmtst v0.4s, v3.4s, v18.4s \n\t" + "cmtst v1.4s, v3.4s, v19.4s \n\t" + "bsl v0.16b, v17.16b, v16.16b \n\t" + "bsl v1.16b, v17.16b, v16.16b \n\t" + "st1 {v0.16b-v1.16b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [unskew]"r"(unskew), + [src_bits]"I"(src_bits) + : /* Clobbers */ + "memory", "cc" + ); + unskew = (unskew - src_bits) & 31; + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + if (unskew == 0) + data = *s++; + else + __asm__ volatile ( + "mov %w[data], v4.s[1] \n\t" + : /* Outputs */ + [data]"=r"(data) + ); + *dest++ = (*cmLookupTable)[data >> 31]; + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +static void fastPathSourceWord_2_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 2; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + destPitch * destY + destX; + + uint32_t *clut = *cmLookupTable; + __asm__ volatile ( + "ldr d6, =0x1000000000001 \n\t" + "ld4 {v16.b-v19.b}[0], [%[clut]], #4 \n\t" + "ld4 {v16.b-v19.b}[1], [%[clut]], #4 \n\t" + "ld4 {v16.b-v19.b}[2], [%[clut]], #4 \n\t" + "ld4 
{v16.b-v19.b}[3], [%[clut]] \n\t" + "ldr d20, =0x00FEFCFA00FEFCFA \n\t" + "ldr d21, =0x0303030303030303 \n\t" + : /* Outputs */ + [clut]"+r"(clut) + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. */ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + uint32_t unskew = (-skew) & 31; + + /* Handle leading pixels, if any */ + uint32_t data, carry; + if (leading_src_bits == 0) { + if (skew == 0) + carry = 0; + else + carry = *s++ << skew; + __asm__ volatile ( + "mov v4.s[1], %w[carry] \n\t" + : /* Outputs */ + : /* Inputs */ + [carry]"r"(carry) + ); + } else { + if (skew == 0) { + data = *s++; + carry = 0; + } + else if (leading_src_bits > skew) { + data = *s++ << skew; + carry = *s++; + data |= carry >> unskew; + carry <<= skew; + } else { + carry = *s++; + data = carry >> unskew; + carry <<= skew; + } + data <<= 32 - leading_src_bits; + + __asm__ volatile ( + "mov v4.s[0], %w[data] \n\t" + "mov v4.s[1], %w[carry] \n\t" + + /* Process 1..7 pixels */ + "dup v2.8b, v4.b[3] \n\t" + "dup v3.8b, v4.b[2] \n\t" + "ext v3.8b, v2.8b, v3.8b, #4 \n\t" + "ushl v3.8b, v3.8b, v20.8b \n\t" + "and v3.8b, v3.8b, v21.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "zip1 v24.8b, v0.8b, v2.8b \n\t" + "zip2 v26.8b, v0.8b, v2.8b \n\t" + "zip1 v25.8b, v1.8b, v3.8b \n\t" + "zip2 v27.8b, v1.8b, v3.8b \n\t" + "zip1 v0.8b, v24.8b, v25.8b \n\t" + "zip2 v1.8b, v24.8b, v25.8b \n\t" + "zip1 v2.8b, v26.8b, v27.8b \n\t" + "zip2 v3.8b, v26.8b, v27.8b \n\t" + "tst %[bits], #8 \n\t" + "b.eq 1f \n\t" + "st1 {v0.8b-v1.8b}, [%[dest]], #16 \n\t" + "mov v0.8b, v2.8b \n\t" + "mov v1.8b, v3.8b \n\t" + "1: \n\t" + "tst %[bits], #4 \n\t" + "b.eq 1f \n\t" + "str d0, [%[dest]], #8 \n\t" + "mov v0.8b, v1.8b \n\t" + "1: \n\t" + "tst %[bits], #2 \n\t" + "b.eq 1f \n\t" + "str s0, [%[dest]], #4 \n\t" + "1: \n\t" + : /* Outputs */ + [dest]"+r"(dest) + : /* Inputs */ + [data]"r"(data), + [carry]"r"(carry), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + } + + uint32_t src_bits = srcBPP*256/destBPP; + uint32_t unskew2 = -(unskew & (src_bits - 1)); + uint32_t skew2 = src_bits + unskew2; + __asm__ volatile ( + "mov v5.s[0], %w[unskew2] \n\t" + "mov v5.s[1], %w[skew2] \n\t" + : /* Outputs */ + : /* Inputs */ + [unskew2]"r"(unskew2), + [skew2]"r"(skew2) + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load source word (if necessary) and shuffle */ + "cmp %[unskew], %[src_bits] \n\t" + "b.hs 1f \n\t" + "ushr d0, d4, #32 \n\t" + "ld1r {v4.2s}, [%[s]], #4 \n\t" + "ushl v4.2s, v4.2s, v5.2s \n\t" + "orr v4.8b, v4.8b, v0.8b \n\t" + "b 2f \n\t" + "1: \n\t" + "mul v4.2s, v6.2s, v4.s[1] \n\t" + "2: \n\t" + + /* Process 8 pixels */ + 
"dup v2.8b, v4.b[3] \n\t" + "dup v3.8b, v4.b[2] \n\t" + "ext v3.8b, v2.8b, v3.8b, #4 \n\t" + "ushl v3.8b, v3.8b, v20.8b \n\t" + "and v3.8b, v3.8b, v21.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "st4 {v0.8b-v3.8b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [unskew]"r"(unskew), + [src_bits]"I"(src_bits) + : /* Clobbers */ + "memory", "cc" + ); + unskew = (unskew - src_bits) & 31; + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + if (unskew == 0) + data = *s++; + else + __asm__ volatile ( + "mov %w[data], v4.s[1] \n\t" + : /* Outputs */ + [data]"=r"(data) + ); + *dest++ = (*cmLookupTable)[data >> 30]; + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +static void fastPathSourceWord_4_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 4; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + destPitch * destY + destX; + + uint32_t *clut = *cmLookupTable; + __asm__ volatile ( + "ld4 {v16.16b-v19.16b}, [%[clut]] \n\t" + "ldr s20, =0x10101010 \n\t" + "ldr d21, =0x00FC00FC00FC00FC \n\t" + : /* Outputs */ + [clut]"+r"(clut) + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. 
*/ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + uint32_t unskew = (-skew) & 31; + + /* Handle leading pixels, if any */ + uint32_t data, carry; + if (leading_src_bits == 0) { + if (skew == 0) + carry = 0; + else + carry = *s++ << skew; + __asm__ volatile ( + "mov v4.s[1], %w[carry] \n\t" + : /* Outputs */ + : /* Inputs */ + [carry]"r"(carry) + ); + } else { + if (skew == 0) { + data = *s++; + carry = 0; + } + else if (leading_src_bits > skew) { + data = *s++ << skew; + carry = *s++; + data |= carry >> unskew; + carry <<= skew; + } else { + carry = *s++; + data = carry >> unskew; + carry <<= skew; + } + data <<= 32 - leading_src_bits; + + __asm__ volatile ( + "mov v4.s[0], %w[data] \n\t" + "mov v4.s[1], %w[carry] \n\t" + + /* Process 1..7 pixels */ + "umull v3.8h, v4.8b, v20.8b \n\t" + "ushl v3.8b, v3.8b, v21.8b \n\t" + "rev64 v3.8b, v3.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "zip1 v24.8b, v0.8b, v2.8b \n\t" + "zip2 v26.8b, v0.8b, v2.8b \n\t" + "zip1 v25.8b, v1.8b, v3.8b \n\t" + "zip2 v27.8b, v1.8b, v3.8b \n\t" + "zip1 v0.8b, v24.8b, v25.8b \n\t" + "zip2 v1.8b, v24.8b, v25.8b \n\t" + "zip1 v2.8b, v26.8b, v27.8b \n\t" + "zip2 v3.8b, v26.8b, v27.8b \n\t" + "tst %[bits], #16 \n\t" + "b.eq 1f \n\t" + "st1 {v0.8b-v1.8b}, [%[dest]], #16 \n\t" + "mov v0.8b, v2.8b \n\t" + "mov v1.8b, v3.8b \n\t" + "1: \n\t" + "tst %[bits], #8 \n\t" + "b.eq 1f \n\t" + "str d0, [%[dest]], #8 \n\t" + "mov v0.8b, v1.8b \n\t" + "1: \n\t" + "tst %[bits], #4 \n\t" + "b.eq 1f \n\t" + "str s0, [%[dest]], #4 \n\t" + "1: \n\t" + : /* Outputs */ + [dest]"+r"(dest) + : /* Inputs */ + [data]"r"(data), + [carry]"r"(carry), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + } + + uint32_t src_bits = srcBPP*256/destBPP; + uint32_t unskew2 = -(unskew & (src_bits - 1)); + uint32_t skew2 = src_bits + unskew2; + __asm__ volatile ( + "mov v5.s[0], %w[unskew2] \n\t" + "mov v5.s[1], %w[skew2] \n\t" + : /* Outputs */ + : /* Inputs */ + [unskew2]"r"(unskew2), + [skew2]"r"(skew2) + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load source word and shuffle */ + "ushr d0, d4, #32 \n\t" + "ld1r {v4.2s}, [%[s]], #4 \n\t" + "ushl v4.2s, v4.2s, v5.2s \n\t" + "orr v4.8b, v4.8b, v0.8b \n\t" + + /* Process 8 pixels */ + "umull v3.8h, v4.8b, v20.8b \n\t" + "ushl v3.8b, v3.8b, v21.8b \n\t" + "rev64 v3.8b, v3.8b \n\t" + "tbl v0.8b, {v16.16b}, v3.8b \n\t" + "tbl v1.8b, {v17.16b}, v3.8b \n\t" + "tbl v2.8b, {v18.16b}, v3.8b \n\t" + "tbl v3.8b, {v19.16b}, v3.8b \n\t" + "st4 {v0.8b-v3.8b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [unskew]"r"(unskew) + : /* Clobbers */ + "memory", "cc" + ); + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + if (unskew == 0) + data = *s++; + else + __asm__ volatile ( + "mov %w[data], v4.s[1] \n\t" + : /* Outputs */ + [data]"=r"(data) + ); + *dest++ = (*cmLookupTable)[data >> 28]; + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +static void fastPathSourceWord_16_32(operation_t *op, uint32_t flags) +{ + IGNORE(flags); + COPY_OP_TO_LOCALS(op, uint32_t, uint32_t); + const unsigned srcBPP = 16; + const unsigned destBPP = 32; + uint32_t *src = srcBits + srcPitch * srcY + srcX * srcBPP / 32; + uint32_t *dest = destBits + 
destPitch * destY + destX; + + __asm__ volatile ( + "ldr q16, =0x00F800F800F800F800F800F800F800F8 \n\t" + "ldr q17, =0xF800F800F800F800F800F800F800F800 \n\t" + "ldr q18, =0x80008000800080008000800080008000 \n\t" + "ldr q19, =0x00010001000100010001000100010001 \n\t" + ); + + do { + uint32_t *s = src; + /* Aim to end the inner loop at a 64-bit boundary in destination image. */ + uint32_t trailing_dest_bits = ((uintptr_t) dest * 8 + width * destBPP) & 63; + uint32_t remain = width - trailing_dest_bits / destBPP; + /* The inner loop works in chunks of 256 bits at the destination image. */ + /* Find the number of pixels that precede the first iteration of the inner loop. */ + uint32_t leading_src_bits = (remain & (256/destBPP-1)) * srcBPP; + /* We define skew as the number of bits into the first or only source image + * word (counting from the MS bit) corresponding to the first pixel of an + * iteration of the inner loop. Where the ratio between source and destination + * depths is greater than 8, this number will change between iterations. */ + uint32_t skew = (srcX * srcBPP + leading_src_bits) & 31; + + if (skew == 0) { + __asm__ volatile ( + /* Load leading pixels */ + "cbz %w[bits], 10f \n\t" + "tbz %w[bits], #4, 1f \n\t" + "ld1 {v4.s}[0], [%[s]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #5, 1f \n\t" + "ld1 {v4.s}[1], [%[s]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 1f \n\t" + "ld1 {v4.d}[1], [%[s]], #8 \n\t" + "1: \n\t" + "rev32 v4.8h, v4.8h \n\t" + + /* Process 1..7 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store leading pixels */ + "tbz %w[bits], #4, 1f \n\t" + "st1 {v0.s}[1], [%[dest]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #5, 1f \n\t" + "st1 {v0.d}[1], [%[dest]], #8 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 10f \n\t" + "st1 {v1.16b}, [%[dest]], #16 \n\t" + "10: \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load pixels */ + "ld1 {v4.16b}, [%[s]], #16 \n\t" + "rev32 v4.8h, v4.8h \n\t" + + /* Process 8 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store pixels */ + "st1 {v0.16b-v1.16b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + : /* Clobbers */ + "memory", "cc" + ); + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + uint32_t in = *s; + uint32_t r = (in >> 7) & 0xF80000; + uint32_t g = (in >> 10) & 0xF800; + uint32_t b = (in >> 13) & 0xF8; + *dest++ = r | g | b | ((in >> 16) == 0x8000); + } + + } else { + __asm__ volatile ( + /* Load leading pixels */ + "ld1r {v4.4s}, [%[s]], #4 \n\t" + "tbz %w[bits], #5, 1f \n\t" + "ld1r {v5.4s}, [%[s]], #4 \n\t" + "ext v4.16b, v4.16b, v5.16b, #12 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 1f \n\t" + "ld1 
{v4.d}[1], [%[s]], #8 \n\t" + "1: \n\t" + "rev32 v5.8h, v4.8h \n\t" + "ext v4.16b, v5.16b, v5.16b, #14 \n\t" + "cbz %w[bits], 10f \n\t" + + /* Process 1..7 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store leading pixels */ + "tbz %w[bits], #4, 1f \n\t" + "st1 {v0.s}[1], [%[dest]], #4 \n\t" + "1: \n\t" + "tbz %w[bits], #5, 1f \n\t" + "st1 {v0.d}[1], [%[dest]], #8 \n\t" + "1: \n\t" + "tbz %w[bits], #6, 10f \n\t" + "st1 {v1.16b}, [%[dest]], #16 \n\t" + "10: \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + [skew]"r"(skew), + [bits]"r"(leading_src_bits) + : /* Clobbers */ + "memory", "cc" + ); + + /* Inner loop */ + for (; remain >= 256/destBPP; remain -= 256/destBPP) { + __asm__ volatile ( + /* Load pixels */ + "mov v4.16b, v5.16b \n\t" + "ld1 {v5.16b}, [%[s]], #16 \n\t" + "rev32 v5.8h, v5.8h \n\t" + "ext v4.16b, v4.16b, v5.16b, #14 \n\t" + + /* Process 8 pixels */ + "shl v0.8h, v4.8h, #3 \n\t" + "shl v1.8h, v4.8h, #6 \n\t" + "ushr v2.8h, v4.8h, #7 \n\t" + "cmeq v3.8h, v4.8h, v18.8h \n\t" + "and v0.16b, v0.16b, v16.16b \n\t" + "and v1.16b, v1.16b, v17.16b \n\t" + "bit v0.16b, v19.16b, v3.16b \n\t" + "and v2.16b, v2.16b, v16.16b \n\t" + "orr v3.16b, v0.16b, v1.16b \n\t" + "zip1 v0.8h, v3.8h, v2.8h \n\t" + "zip2 v1.8h, v3.8h, v2.8h \n\t" + + /* Store pixels */ + "st1 {v0.16b-v1.16b}, [%[dest]], #32 \n\t" + : /* Outputs */ + [s]"+r"(s), + [dest]"+r"(dest) + : /* Inputs */ + : /* Clobbers */ + "memory", "cc" + ); + } + + /* Handle trailing pixel, if any */ + if (trailing_dest_bits) { + uint32_t in; + __asm__ volatile ( + "mov %w[in], v5.s[3] \n\t" + : /* Outputs */ + [in]"=r"(in) + ); + uint32_t r = (in >> 7) & 0xF80000; + uint32_t g = (in >> 10) & 0xF800; + uint32_t b = (in >> 13) & 0xF8; + *dest++ = r | g | b | ((in >> 16) == 0x8000); + } + + } + + src += srcPitch; + dest += destPitch - width; + } while (--height > 0); +} + +/******************************************************************************/ + +#define AlphaBlend_scalar_0_32_DECLARE_TEMPORARIES + +#define AlphaBlend_scalar_0_32_PASS_TEMPORARIES + +#define AlphaBlend_scalar_0_32_INIT \ + "ld4r {v20.16b-v23.16b}, [%[halftone]] \n\t" \ + "movi v18.16b, #0xFF \n\t" \ + "mvn v19.16b, v23.16b \n\t" \ + "umull v20.8h, v20.8b, v23.8b \n\t" \ + "umull v21.8h, v21.8b, v23.8b \n\t" \ + "umull v22.8h, v22.8b, v23.8b \n\t" \ + "umull v23.8h, v18.8b, v23.8b \n\t" \ + +#define AlphaBlend_scalar_0_32_32 \ + "ld4 {v4.b-v7.b}[0], [%[dest]] \n\t" \ + AlphaBlend_scalar_0_32_256 \ + "st4 {v0.b-v3.b}[0], [%[dest]], #4 \n\t" \ + +#define AlphaBlend_scalar_0_32_256 \ + "umull v24.8h, v4.8b, v19.8b \n\t" \ + "umull v25.8h, v5.8b, v19.8b \n\t" \ + "umull v26.8h, v6.8b, v19.8b \n\t" \ + "umull v27.8h, v7.8b, v19.8b \n\t" \ + "add v24.8h, v24.8h, v20.8h \n\t" \ + "add v25.8h, v25.8h, v21.8h \n\t" \ + "add v26.8h, v26.8h, v22.8h \n\t" \ + "add v27.8h, v27.8h, v23.8h \n\t" \ + "cmtst v0.8h, v24.8h, v18.8h \n\t" \ + "cmtst v1.8h, v25.8h, v18.8h \n\t" \ + "cmtst v2.8h, v26.8h, v18.8h \n\t" \ + "cmtst v3.8h, v27.8h, v18.8h \n\t" \ + "usra v24.8h, v24.8h, #8 \n\t" \ + "usra v25.8h, v25.8h, #8 \n\t" \ + "usra v26.8h, v26.8h, #8 \n\t" \ + "usra v27.8h, v27.8h, #8 \n\t" \ + "sub v0.16b, v24.16b, 
v0.16b \n\t" \ + "sub v1.16b, v25.16b, v1.16b \n\t" \ + "sub v2.16b, v26.16b, v2.16b \n\t" \ + "sub v3.16b, v27.16b, v3.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + +#define AlphaBlend_scalar_0_32_512 \ + "umull v24.8h, v4.8b, v19.8b \n\t" \ + "umull2 v25.8h, v4.16b, v19.16b \n\t" \ + "umull v26.8h, v5.8b, v19.8b \n\t" \ + "umull2 v27.8h, v5.16b, v19.16b \n\t" \ + "umull v28.8h, v6.8b, v19.8b \n\t" \ + "umull2 v29.8h, v6.16b, v19.16b \n\t" \ + "umull v30.8h, v7.8b, v19.8b \n\t" \ + "umull2 v31.8h, v7.16b, v19.16b \n\t" \ + "add v24.8h, v24.8h, v20.8h \n\t" \ + "add v25.8h, v25.8h, v20.8h \n\t" \ + "add v26.8h, v26.8h, v21.8h \n\t" \ + "add v27.8h, v27.8h, v21.8h \n\t" \ + "add v28.8h, v28.8h, v22.8h \n\t" \ + "add v29.8h, v29.8h, v22.8h \n\t" \ + "add v30.8h, v30.8h, v23.8h \n\t" \ + "add v31.8h, v31.8h, v23.8h \n\t" \ + "cmtst v0.8h, v24.8h, v18.8h \n\t" \ + "cmtst v1.8h, v25.8h, v18.8h \n\t" \ + "cmtst v2.8h, v26.8h, v18.8h \n\t" \ + "cmtst v3.8h, v27.8h, v18.8h \n\t" \ + "cmtst v4.8h, v28.8h, v18.8h \n\t" \ + "cmtst v5.8h, v29.8h, v18.8h \n\t" \ + "cmtst v6.8h, v30.8h, v18.8h \n\t" \ + "cmtst v7.8h, v31.8h, v18.8h \n\t" \ + "usra v24.8h, v24.8h, #8 \n\t" \ + "usra v25.8h, v25.8h, #8 \n\t" \ + "usra v26.8h, v26.8h, #8 \n\t" \ + "usra v27.8h, v27.8h, #8 \n\t" \ + "usra v28.8h, v28.8h, #8 \n\t" \ + "usra v29.8h, v29.8h, #8 \n\t" \ + "usra v30.8h, v30.8h, #8 \n\t" \ + "usra v31.8h, v31.8h, #8 \n\t" \ + "sub v0.16b, v24.16b, v0.16b \n\t" \ + "sub v1.16b, v25.16b, v1.16b \n\t" \ + "sub v2.16b, v26.16b, v2.16b \n\t" \ + "sub v3.16b, v27.16b, v3.16b \n\t" \ + "sub v4.16b, v28.16b, v4.16b \n\t" \ + "sub v5.16b, v29.16b, v5.16b \n\t" \ + "sub v6.16b, v30.16b, v6.16b \n\t" \ + "sub v7.16b, v31.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v24.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v25.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v26.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v27.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v24.d[0] \n\t" \ + "mov v1.d[1], v25.d[0] \n\t" \ + "mov v2.d[1], v26.d[0] \n\t" \ + "mov v3.d[1], v27.d[0] \n\t" \ + +DEFINE_FAST_PATH(AlphaBlend_scalar, 0_32, YES, YES) + +/******************************************************************************/ + +#define AlphaBlend_map1_scalar_32_32_DECLARE_STACK uint64_t save[3] + +#define AlphaBlend_map1_scalar_32_32_PASS_STACK , [save]"r"(save) + +#define AlphaBlend_map1_scalar_32_32_DECLARE_TEMPORARIES + +#define AlphaBlend_map1_scalar_32_32_PASS_TEMPORARIES + +#define AlphaBlend_map1_scalar_32_32_INIT \ + "str d13, [%[save], #0] \n\t" \ + "stp d14, d15, [%[save], #8] \n\t" \ + "ld4r {v24.16b-v27.16b}, [%[clut]], #4 \n\t" \ + "ld4r {v28.16b-v31.16b}, [%[clut]] \n\t" \ + "ld4r {v16.16b-v19.16b}, [%[halftone]] \n\t" \ + "movi v13.16b, #0xFF \n\t" \ + "and v24.8b, v24.8b, v16.8b \n\t" \ + "and v25.8b, v25.8b, v17.8b \n\t" \ + "and v26.8b, v26.8b, v18.8b \n\t" \ + "and v27.16b, v27.16b, v19.16b \n\t" \ + "and v28.8b, v28.8b, v16.8b \n\t" \ + "and v29.8b, v29.8b, v17.8b \n\t" \ + "and v30.8b, v30.8b, v18.8b \n\t" \ + "and v31.16b, v31.16b, v19.16b \n\t" \ + "mvn v14.16b, v27.16b \n\t" \ + "mvn v15.16b, v31.16b \n\t" \ + "umull v24.8h, v24.8b, v27.8b \n\t" \ + "umull v25.8h, v25.8b, v27.8b \n\t" \ + "umull v26.8h, v26.8b, v27.8b \n\t" \ + "umull v27.8h, v13.8b, v27.8b \n\t" \ + "umull v28.8h, v28.8b, v31.8b \n\t" \ + "umull v29.8h, v29.8b, v31.8b \n\t" \ + "umull 
v30.8h, v30.8b, v31.8b \n\t" \ + "umull v31.8h, v13.8b, v31.8b \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_FINAL \ + "ldr d13, [%[save], #0] \n\t" \ + "ldp d14, d15, [%[save], #8] \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_32 \ + "ldr s1, [%[src]], #4 \n\t" \ + "ld4 {v4.b-v7.b}[0], [%[dest]] \n\t" \ + "cmtst v0.2s, v1.2s, v13.2s \n\t" \ + "cmtst v16.2s, v1.2s, v13.2s \n\t" \ + "cmtst v17.2s, v1.2s, v13.2s \n\t" \ + "cmtst v18.2s, v1.2s, v13.2s \n\t" \ + "cmtst v19.2s, v1.2s, v13.2s \n\t" \ + "bsl v0.8b, v15.8b, v14.8b \n\t" \ + "bsl v16.8b, v28.8b, v24.8b \n\t" \ + "bsl v17.8b, v29.8b, v25.8b \n\t" \ + "bsl v18.8b, v30.8b, v26.8b \n\t" \ + "bsl v19.8b, v31.8b, v27.8b \n\t" \ + "umlal v16.8h, v4.8b, v0.8b \n\t" \ + "umlal v17.8h, v5.8b, v0.8b \n\t" \ + "umlal v18.8h, v6.8b, v0.8b \n\t" \ + "umlal v19.8h, v7.8b, v0.8b \n\t" \ + "cmtst v0.4h, v16.4h, v13.4h \n\t" \ + "cmtst v1.4h, v17.4h, v13.4h \n\t" \ + "cmtst v2.4h, v18.4h, v13.4h \n\t" \ + "cmtst v3.4h, v19.4h, v13.4h \n\t" \ + "usra v16.4h, v16.4h, #8 \n\t" \ + "usra v17.4h, v17.4h, #8 \n\t" \ + "usra v18.4h, v18.4h, #8 \n\t" \ + "usra v19.4h, v19.4h, #8 \n\t" \ + "sub v0.8b, v16.8b, v0.8b \n\t" \ + "sub v1.8b, v17.8b, v1.8b \n\t" \ + "sub v2.8b, v18.8b, v2.8b \n\t" \ + "sub v3.8b, v19.8b, v3.8b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + "st4 {v0.b-v3.b}[0], [%[dest]], #4 \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_256 \ + "orr v0.8b, v0.8b, v1.8b \n\t" \ + "orr v2.8b, v2.8b, v3.8b \n\t" \ + "orr v0.8b, v0.8b, v2.8b \n\t" \ + "cmtst v0.8b, v0.8b, v13.8b \n\t" \ + "sxtl v16.8h, v0.8b \n\t" \ + "sxtl v17.8h, v0.8b \n\t" \ + "sxtl v18.8h, v0.8b \n\t" \ + "sxtl v19.8h, v0.8b \n\t" \ + "bsl v0.8b, v15.8b, v14.8b \n\t" \ + "bsl v16.16b, v28.16b, v24.16b \n\t" \ + "bsl v17.16b, v29.16b, v25.16b \n\t" \ + "bsl v18.16b, v30.16b, v26.16b \n\t" \ + "bsl v19.16b, v31.16b, v27.16b \n\t" \ + "umlal v16.8h, v4.8b, v0.8b \n\t" \ + "umlal v17.8h, v5.8b, v0.8b \n\t" \ + "umlal v18.8h, v6.8b, v0.8b \n\t" \ + "umlal v19.8h, v7.8b, v0.8b \n\t" \ + "cmtst v0.8h, v16.8h, v13.8h \n\t" \ + "cmtst v1.8h, v17.8h, v13.8h \n\t" \ + "cmtst v2.8h, v18.8h, v13.8h \n\t" \ + "cmtst v3.8h, v19.8h, v13.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v1.8b, v1.8h, #8 \n\t" \ + "shrn v2.8b, v2.8h, #8 \n\t" \ + "shrn v3.8b, v3.8h, #8 \n\t" \ + +#define AlphaBlend_map1_scalar_32_32_512 \ + "orr v0.16b, v0.16b, v1.16b \n\t" \ + "orr v2.16b, v2.16b, v3.16b \n\t" \ + "orr v0.16b, v0.16b, v2.16b \n\t" \ + "cmtst v0.16b, v0.16b, v13.16b \n\t" \ + "sxtl v16.8h, v0.8b \n\t" \ + "sxtl2 v17.8h, v0.16b \n\t" \ + "sxtl v18.8h, v0.8b \n\t" \ + "sxtl2 v19.8h, v0.16b \n\t" \ + "sxtl v20.8h, v0.8b \n\t" \ + "sxtl2 v21.8h, v0.16b \n\t" \ + "sxtl v22.8h, v0.8b \n\t" \ + "sxtl2 v23.8h, v0.16b \n\t" \ + "bsl v0.16b, v15.16b, v14.16b \n\t" \ + "bsl v16.16b, v28.16b, v24.16b \n\t" \ + "bsl v17.16b, v28.16b, v24.16b \n\t" \ + "bsl v18.16b, v29.16b, v25.16b \n\t" \ + "bsl v19.16b, v29.16b, v25.16b \n\t" \ + "bsl v20.16b, v30.16b, v26.16b \n\t" \ + "bsl v21.16b, v30.16b, v26.16b \n\t" \ + "bsl v22.16b, v31.16b, v27.16b \n\t" \ + "bsl v23.16b, v31.16b, v27.16b \n\t" \ + "umlal v16.8h, v4.8b, v0.8b 
\n\t" \ + "umlal2 v17.8h, v4.16b, v0.16b \n\t" \ + "umlal v18.8h, v5.8b, v0.8b \n\t" \ + "umlal2 v19.8h, v5.16b, v0.16b \n\t" \ + "umlal v20.8h, v6.8b, v0.8b \n\t" \ + "umlal2 v21.8h, v6.16b, v0.16b \n\t" \ + "umlal v22.8h, v7.8b, v0.8b \n\t" \ + "umlal2 v23.8h, v7.16b, v0.16b \n\t" \ + "cmtst v0.8h, v16.8h, v13.8h \n\t" \ + "cmtst v1.8h, v17.8h, v13.8h \n\t" \ + "cmtst v2.8h, v18.8h, v13.8h \n\t" \ + "cmtst v3.8h, v19.8h, v13.8h \n\t" \ + "cmtst v4.8h, v20.8h, v13.8h \n\t" \ + "cmtst v5.8h, v21.8h, v13.8h \n\t" \ + "cmtst v6.8h, v22.8h, v13.8h \n\t" \ + "cmtst v7.8h, v23.8h, v13.8h \n\t" \ + "usra v16.8h, v16.8h, #8 \n\t" \ + "usra v17.8h, v17.8h, #8 \n\t" \ + "usra v18.8h, v18.8h, #8 \n\t" \ + "usra v19.8h, v19.8h, #8 \n\t" \ + "usra v20.8h, v20.8h, #8 \n\t" \ + "usra v21.8h, v21.8h, #8 \n\t" \ + "usra v22.8h, v22.8h, #8 \n\t" \ + "usra v23.8h, v23.8h, #8 \n\t" \ + "sub v0.16b, v16.16b, v0.16b \n\t" \ + "sub v1.16b, v17.16b, v1.16b \n\t" \ + "sub v2.16b, v18.16b, v2.16b \n\t" \ + "sub v3.16b, v19.16b, v3.16b \n\t" \ + "sub v4.16b, v20.16b, v4.16b \n\t" \ + "sub v5.16b, v21.16b, v5.16b \n\t" \ + "sub v6.16b, v22.16b, v6.16b \n\t" \ + "sub v7.16b, v23.16b, v7.16b \n\t" \ + "shrn v0.8b, v0.8h, #8 \n\t" \ + "shrn v16.8b, v1.8h, #8 \n\t" \ + "shrn v1.8b, v2.8h, #8 \n\t" \ + "shrn v17.8b, v3.8h, #8 \n\t" \ + "shrn v2.8b, v4.8h, #8 \n\t" \ + "shrn v18.8b, v5.8h, #8 \n\t" \ + "shrn v3.8b, v6.8h, #8 \n\t" \ + "shrn v19.8b, v7.8h, #8 \n\t" \ + "mov v0.d[1], v16.d[0] \n\t" \ + "mov v1.d[1], v17.d[0] \n\t" \ + "mov v2.d[1], v18.d[0] \n\t" \ + "mov v3.d[1], v19.d[0] \n\t" \ + +DEFINE_FAST_PATH(AlphaBlend_map1_scalar, 32_32, YES, YES) + +/******************************************************************************/ + +static fast_path_t fastPaths[] = { + { fastPathAlphaBlend_32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, + { fastPathPixPaint_32_32, CR_pixPaint, STD_FLAGS(32,32,NO,NO) }, + + { fastPathSourceWord_1_32, CR_sourceWord, STD_FLAGS(1,32,DIRECT,NO) }, + { fastPathSourceWord_2_32, CR_sourceWord, STD_FLAGS(2,32,DIRECT,NO) }, + { fastPathSourceWord_4_32, CR_sourceWord, STD_FLAGS(4,32,DIRECT,NO) }, + { fastPathSourceWord_16_32, CR_sourceWord, STD_FLAGS(16,32,NO,NO) }, + + { fastPathAlphaBlend_scalar_0_32, CR_alphaBlend, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + { fastPathAlphaBlend_map1_scalar_32_32, CR_alphaBlend, STD_FLAGS(32,32,1BIT,SCALAR) }, + + { fastPathBitAnd_32_32, CR_bitAnd, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitAndInvert_32_32, CR_bitAndInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertAnd_32_32, CR_bitInvertAnd, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitXor_32_32, CR_bitXor, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitOr_32_32, CR_bitOr, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertAndInvert_32_32, CR_bitInvertAndInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertXor_32_32, CR_bitInvertXor, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertDestination_0_32, CR_bitInvertDestination, STD_FLAGS_NO_SOURCE(32,NO) }, + { fastPathBitOrInvert_32_32, CR_bitOrInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertSource_32_32, CR_bitInvertSource, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertOr_32_32, CR_bitInvertOr, STD_FLAGS(32,32,NO,NO) }, + { fastPathBitInvertOrInvert_32_32, CR_bitInvertOrInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathAddWord_32_32, CR_addWord, STD_FLAGS(32,32,NO,NO) }, + { fastPathSubWord_32_32, CR_subWord, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbAdd_32_32, CR_rgbAdd, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbSub_32_32, CR_rgbSub, STD_FLAGS(32,32,NO,NO) }, + { 
fastPathPixMask_32_32, CR_pixMask, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbMax_32_32, CR_rgbMax, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbMin_32_32, CR_rgbMin, STD_FLAGS(32,32,NO,NO) }, + { fastPathRgbMinInvert_32_32, CR_rgbMinInvert, STD_FLAGS(32,32,NO,NO) }, + { fastPathAlphaBlendConst_32_32, CR_alphaBlendConst, STD_FLAGS(32,32,NO,NO) }, +}; + +void addArm64FastPaths(void) +{ + addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths); +} diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h new file mode 100644 index 0000000000..afc73251bc --- /dev/null +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltArm64.h @@ -0,0 +1,30 @@ +/* + * Copyright © 2021 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + */ + +#ifndef BITBLTARM64_H_ +#define BITBLTARM64_H_ + +void addArm64FastPaths(void); + +#endif /* BITBLTARM64_H_ */ diff --git a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c index 10a762be43..a2e0aa32ab 100644 --- a/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c +++ b/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c @@ -32,6 +32,7 @@ #include "BitBltDispatch.h" #include "BitBltArm.h" +#include "BitBltArm64.h" #include "BitBltGeneric.h" #include "BitBltInternal.h" @@ -202,7 +203,9 @@ void initialiseCopyBits(void) #ifdef __arm__ addArmFastPaths(); #endif - +#ifdef __aarch64__ + addArm64FastPaths(); +#endif } void addFastPaths(fast_path_t *paths, size_t n) @@ -315,6 +318,55 @@ void copyBitsDispatch(operation_t *op) * each pixel has value 0 or not (note that non-zero pixels that * reduce to 0 when bit-packed use lookup index 1) */ +#ifdef __aarch64__ + if (op->cmMask >= 15 && (op->cmMask & (op->cmMask + 1)) == 0) { + uint32_t value = (*op->cmLookupTable)[1]; + uint32_t *ptr = (*op->cmLookupTable) + 2; + uint32_t *end = (*op->cmLookupTable) + 16; + for (; ptr != end; ++ptr) + if (*ptr != value) + break; + if (ptr == end) { + uint64_t result0, result1; + end += op->cmMask + 1 - 16; + __asm__ volatile ( + "dup v16.4s, %w[value] \n\t" + "movi v4.16b, #0 \n\t" + "movi v5.16b, #0 \n\t" + "movi v6.16b, #0 \n\t" + "movi v7.16b, #0 \n\t" + "1: \n\t" + "ld1 {v0.16b-v3.16b}, [%[ptr]], #64 \n\t" + "eor v0.16b, v0.16b, v16.16b \n\t" + "eor v1.16b, v1.16b, v16.16b \n\t" + "eor v2.16b, v2.16b, v16.16b \n\t" + "eor v3.16b, v3.16b, v16.16b \n\t" + "orr v4.16b, v4.16b, v0.16b \n\t" + "orr v5.16b, v5.16b, v1.16b \n\t" + "orr v6.16b, v6.16b, v2.16b \n\t" + "orr v7.16b, v7.16b, v3.16b \n\t" + "cmp %[ptr], %[end] \n\t" + "b.ne 1b \n\t" + "orr v4.16b, v4.16b, v5.16b \n\t" + "orr v6.16b, v6.16b, v7.16b \n\t" + "orr v4.16b, v4.16b, v6.16b \n\t" + "mov %[result0], v4.d[0] \n\t" + "mov %[result1], v4.d[1] \n\t" + : /* Outputs */ + [ptr]"+r"(ptr), + [result0]"=r"(result0), + [result1]"=r"(result1) + : /* Inputs */ + [value]"r"(value), + [end]"r"(end) + : /* Clobbers */ + "cc" + ); + if (result0 == 0 && result1 == 0) + flags |= FAST_PATH_1BIT_COLOR_MAP; + } + } +#else usqInt i; flags |= FAST_PATH_1BIT_COLOR_MAP; for (i = op->cmMask; i >= 2; --i) { @@ -323,6 +375,7 @@ void copyBitsDispatch(operation_t *op) break; } } +#endif if ((flags & FAST_PATH_1BIT_COLOR_MAP) == 0) { if (op->src.depth == 32) { if (op->cmMask == 0x7FFF && memcmp(op->cmMaskTable, maskTable85, sizeof maskTable85) == 0 && memcmp(op->cmShiftTable, shiftTable85, sizeof shiftTable85) == 0) diff --git a/platforms/unix/config/configure b/platforms/unix/config/configure index a2f424d486..4d7d0a5260 100755 --- a/platforms/unix/config/configure +++ b/platforms/unix/config/configure @@ -18401,7 +18401,7 @@ aarch64) # Check whether --enable-fast-bitblt was given. 
if test "${enable_fast_bitblt+set}" = set; then : enableval=$enable_fast_bitblt; if test "x$enableval" = "xyes" ; then - bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_objs="BitBltPlugin.o BitBltArm64.o BitBltDispatch.o BitBltGeneric.o" bitblt_flags="-DENABLE_FAST_BLT" fi diff --git a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 index 146b16acee..f7f2e51f2c 100644 --- a/platforms/unix/plugins/BitBltPlugin/acinclude.m4 +++ b/platforms/unix/plugins/BitBltPlugin/acinclude.m4 @@ -22,7 +22,7 @@ aarch64) AC_ARG_ENABLE(fast-bitblt, [ --enable-fast-bitblt enable fast BitBlt optimizations (default=no)], [ if test "x$enableval" = "xyes" ; then - bitblt_objs="BitBltPlugin.o BitBltDispatch.o BitBltGeneric.o" + bitblt_objs="BitBltPlugin.o BitBltArm64.o BitBltDispatch.o BitBltGeneric.o" bitblt_flags="-DENABLE_FAST_BLT" fi ],
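
The AArch64 branch added to copyBitsDispatch above replaces a scalar scan of the colour map with a NEON loop that consumes the lookup table 64 bytes at a time. For reference, the property that loop establishes can be restated in plain C roughly as follows; this is an illustrative sketch only, not part of the patch, and the function name isOneBitColourMap is invented here. A map qualifies as a 1-bit colour map when every entry from index 2 up to cmMask equals entry 1, so the map only distinguishes pixel value 0 from everything else; the vectorised version additionally requires cmMask >= 15 and cmMask + 1 to be a power of two so the table length is a whole number of 64-byte blocks.

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch (illustrative only) of the condition the NEON loop checks
     * before setting FAST_PATH_1BIT_COLOR_MAP: all lookup-table entries
     * from index 2 through cmMask match entry 1, i.e. the colour map
     * only separates pixel value 0 from non-zero pixel values. */
    static bool isOneBitColourMap(const uint32_t *lookupTable, uint32_t cmMask)
    {
        uint32_t reference = lookupTable[1];
        for (uint32_t i = 2; i <= cmMask; i++)
            if (lookupTable[i] != reference)
                return false;
        return true;
    }

None of the new ARM64 code is compiled unless the VM is configured with --enable-fast-bitblt, which, per the configure and acinclude.m4 hunks above, defines ENABLE_FAST_BLT and adds BitBltArm64.o to the BitBlt plugin objects on aarch64.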