author    sanine <sanine.not@pm.me>  2022-10-12 13:26:36 -0500
committer sanine <sanine.not@pm.me>  2022-10-12 13:26:36 -0500
commit    f567ea1e2798fd3156a416e61f083ea3e6b95719 (patch)
tree      53b51bda93aadf9dd13fcd77635b8c4b5c813768 /libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h
parent    530ffd0b7d3c39757b20f00716e486b5caf89aff (diff)
add pixman and libpngraylib
Diffstat (limited to 'libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h')
-rw-r--r--  libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h  966
1 files changed, 966 insertions, 0 deletions
diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h
new file mode 100644
index 0000000..da153c3
--- /dev/null
+++ b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h
@@ -0,0 +1,966 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/*
+ * Because the alignment of pixel data to cachelines, and even the number of
+ * cachelines per row, can vary from row to row, and because of the need to
+ * preload each scanline once and only once, this prefetch strategy treats
+ * each row of pixels independently. When a pixel row is long enough, there
+ * are three distinct phases of prefetch:
+ * * an inner loop section, where each time a cacheline of data is
+ * processed, another cacheline is preloaded (the exact distance ahead is
+ * determined empirically using profiling results from lowlevel-blt-bench)
+ * * a leading section, where enough cachelines are preloaded to ensure no
+ * cachelines escape being preloaded when the inner loop starts
+ * * a trailing section, where a limited number (0 or more) of cachelines
+ * are preloaded to deal with data (if any) that hangs off the end of the
+ * last iteration of the inner loop, plus any trailing bytes that were not
+ * enough to make up one whole iteration of the inner loop
+ *
+ * There are (in general) three distinct code paths, selected between
+ * depending upon how long the pixel row is. If it is long enough that there
+ * is at least one iteration of the inner loop (as described above) then
+ * this is described as the "wide" case. If it is shorter than that, but
+ * there are still enough bytes output that there is at least one 16-byte-
+ * long, 16-byte-aligned write to the destination (the optimum type of
+ * write), then this is the "medium" case. If it is not even this long, then
+ * this is the "narrow" case, and there is no attempt to align writes to
+ * 16-byte boundaries. In the "medium" and "narrow" cases, all the
+ * cachelines containing data from the pixel row are prefetched up-front.
+ */
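+
+/*
+ * A worked example (assuming, purely for illustration, a 32bpp source, a
+ * 32bpp destination and a prefetch_distance of 3): each 16-byte store then
+ * covers 128/dst_w_bpp = 4 pixels and each 32-byte cacheline covers
+ * pix_per_block = 8 pixels, so the comparisons emitted by
+ * generate_composite_function below send rows of at least
+ * (prefetch_distance+3)*pix_per_block - 1 = 47 pixels down the "wide" path,
+ * rows of at least 2*16*8/dst_w_bpp - 1 = 7 pixels (but fewer than 47) down
+ * the "medium" path, and anything shorter down the "narrow" path.
+ */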
+
+/*
+ * Determine whether we put the arguments on the stack for debugging.
+ */
+#undef DEBUG_PARAMS
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY, 0
+.set FLAG_DST_READWRITE, 1
+.set FLAG_COND_EXEC, 0
+.set FLAG_BRANCH_OVER, 2
+.set FLAG_PROCESS_PRESERVES_PSR, 0
+.set FLAG_PROCESS_CORRUPTS_PSR, 4
+.set FLAG_PROCESS_DOESNT_STORE, 0
+.set FLAG_PROCESS_DOES_STORE, 8 /* usually because the process macro needs to conditionally skip the store */
+.set FLAG_NO_SPILL_LINE_VARS, 0
+.set FLAG_SPILL_LINE_VARS_WIDE, 16
+.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
+.set FLAG_SPILL_LINE_VARS, 48
+.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
+.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0, 0
+.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
+.set FLAG_PRELOAD_DST, 0
+.set FLAG_NO_PRELOAD_DST, 256
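+
+/*
+ * These flags are ORed together to form the 'flags_' argument of
+ * generate_composite_function below. For example (an illustrative
+ * combination only, not taken from any particular fast path), a simple
+ * copy might use
+ *     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH
+ * while an operation that blends with the existing destination might use
+ *     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR
+ */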
+
+/*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
+
+/*
+ * Offset into stack where mask and source pointer/stride can be accessed.
+ */
+#ifdef DEBUG_PARAMS
+.set ARGS_STACK_OFFSET, (9*4+9*4)
+#else
+.set ARGS_STACK_OFFSET, (9*4)
+#endif
+
+/*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET, 0
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE, 0
+.set PREFETCH_TYPE_STANDARD, 1
+
+/*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+ .if unaligned == 1
+ op&r&cond WK&reg0, [base], #4
+ op&r&cond WK&reg1, [base], #4
+ op&r&cond WK&reg2, [base], #4
+ op&r&cond WK&reg3, [base], #4
+ .else
+ op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .endif
+ .elseif numbytes == 8
+ .if unaligned == 1
+ op&r&cond WK&reg0, [base], #4
+ op&r&cond WK&reg1, [base], #4
+ .else
+ op&m&cond&ia base!, {WK&reg0,WK&reg1}
+ .endif
+ .elseif numbytes == 4
+ op&r&cond WK&reg0, [base], #4
+ .elseif numbytes == 2
+ op&r&cond&h WK&reg0, [base], #2
+ .elseif numbytes == 1
+ op&r&cond&b WK&reg0, [base], #1
+ .else
+ .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+ stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .elseif numbytes == 8
+ stm&cond&db base, {WK&reg0,WK&reg1}
+ .elseif numbytes == 4
+ str&cond WK&reg0, [base, #-4]
+ .elseif numbytes == 2
+ str&cond&h WK&reg0, [base, #-2]
+ .elseif numbytes == 1
+ str&cond&b WK&reg0, [base, #-1]
+ .else
+ .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+ pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+ pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+ pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
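+
+/* For example (using the WK0-WK3 register aliases defined further down):
+ *     pixld , 16, 0, SRC, 0   expands to   ldmia SRC!, {WK0,WK1,WK2,WK3}
+ *     pixld , 16, 0, SRC, 1   expands to four post-indexed "ldr WKn, [SRC], #4"
+ *     pixld , 2, 0, SRC, 0    expands to   ldrh WK0, [SRC], #2
+ * When FLAG_DST_READWRITE is set, pixst stores relative to the already-
+ * advanced base (stmdb / negative offsets), because the earlier destination
+ * load will have post-incremented DST.
+ */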
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+ a x
+ .endif
+.endm
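+
+/* For example, "PF pld, [ptr, #OFFSET]" emits the pld only when prefetching
+ * is enabled for the current function (i.e. prefetch_distance != 0, which
+ * selects PREFETCH_TYPE_STANDARD below); otherwise it assembles to nothing.
+ */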
+
+
+.macro preload_leading_step1 bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if bpp > 0
+ PF bic, ptr, base, #31
+ .set OFFSET, 0
+ .rept prefetch_distance+1
+ PF pld, [ptr, #OFFSET]
+ .set OFFSET, OFFSET+32
+ .endr
+ .endif
+.endm
+
+.macro preload_leading_step2 bpp, bpp_shift, ptr, base
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+ * are the bytes corresponding to the leading pixels more than the amount
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ * inner_loop_offset = (src+leading_bytes)&31
+ * extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
+ */
+ .if bpp > 0
+ .ifc base,DST
+ /* The test can be simplified further when preloading the destination */
+ PF tst, base, #16
+ PF beq, 61f
+ .else
+ .if bpp/dst_w_bpp == 4
+ PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+ PF and, SCRATCH, SCRATCH, #31
+ PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+ PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
+ PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
+ PF bcs, 61f
+ PF bpl, 60f
+ PF pld, [ptr, #32*(prefetch_distance+2)]
+ .else
+ PF mov, SCRATCH, base, lsl #32-5
+ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF bls, 61f
+ .endif
+ .endif
+60: PF pld, [ptr, #32*(prefetch_distance+1)]
+61:
+ .endif
+.endm
+
+#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
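+/* INDEX & ~(INDEX+1) is a mask of the trailing 1 bits of INDEX, so for a
+ * power-of-two SIZE >= 2 the test fires exactly when INDEX is the last
+ * index in a group of SIZE (INDEX % SIZE == SIZE-1), e.g. SUBBLOCK = 3, 7,
+ * 11, ... for SIZE = 4; for SIZE < 2 it is always true. preload_middle
+ * uses this to issue at most one preload per group of STMs.
+ */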
+.macro preload_middle bpp, base, scratch_holds_offset
+ .if bpp > 0
+ /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
+ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
+ .if scratch_holds_offset
+ PF pld, [base, SCRATCH]
+ .else
+ PF bic, SCRATCH, base, #31
+ PF pld, [SCRATCH, #32*prefetch_distance]
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro preload_trailing bpp, bpp_shift, base
+ .if bpp > 0
+ .if bpp*pix_per_block > 256
+ /* Calculations are more complex if more than one fetch per block */
+ PF and, WK1, base, #31
+ PF add, WK1, WK1, WK0, lsl #bpp_shift
+ PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
+ PF bic, SCRATCH, base, #31
+80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
+ PF add, SCRATCH, SCRATCH, #32
+ PF subs, WK1, WK1, #32
+ PF bhi, 80b
+ .else
+ /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
+ PF mov, SCRATCH, base, lsl #32-5
+ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
+ PF adceqs, SCRATCH, SCRATCH, #0
+ /* The instruction above has two effects: ensures Z is only
+ * set if C was clear (so Z indicates that both shifted quantities
+ * were 0), and clears C if Z was set (so C indicates that the sum
+ * of the shifted quantities was greater than, and not merely equal to, 32) */
+ PF beq, 82f
+ PF bic, SCRATCH, base, #31
+ PF bcc, 81f
+ PF pld, [SCRATCH, #32*(prefetch_distance+2)]
+81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
+82:
+ .endif
+ .endif
+.endm
+
+
+.macro preload_line narrow_case, bpp, bpp_shift, base
+/* "narrow_case" - just means that the macro was invoked from the "narrow"
+ * code path rather than the "medium" one - because in the narrow case,
+ * the row of pixels is known to output no more than 30 bytes, so
+ * (assuming the source pixels are no wider than the destination
+ * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ * meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ * destination) that's being preloaded, or 0 if this channel is not used
+ * for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+ .if bpp > 0
+ .if narrow_case && (bpp <= dst_w_bpp)
+ /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+ PF bic, WK0, base, #31
+ PF pld, [WK0]
+ PF add, WK1, base, X, LSL #bpp_shift
+ PF sub, WK1, WK1, #1
+ PF bic, WK1, WK1, #31
+ PF cmp, WK1, WK0
+ PF beq, 90f
+ PF pld, [WK1]
+90:
+ .else
+ PF bic, WK0, base, #31
+ PF pld, [WK0]
+ PF add, WK1, base, X, lsl #bpp_shift
+ PF sub, WK1, WK1, #1
+ PF bic, WK1, WK1, #31
+ PF cmp, WK1, WK0
+ PF beq, 92f
+91: PF add, WK0, WK0, #32
+ PF cmp, WK0, WK1
+ PF pld, [WK0]
+ PF bne, 91b
+92:
+ .endif
+ .endif
+.endm
+
+
+.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+ .if decrementx
+ sub&cond X, X, #8*numbytes/dst_w_bpp
+ .endif
+ process_tail cond, numbytes, firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst cond, numbytes, firstreg, DST
+ .endif
+.endm
+
+.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+ .ifc cond,mi
+ bpl 100f
+ .endif
+ .ifc cond,cs
+ bcc 100f
+ .endif
+ .ifc cond,ne
+ beq 100f
+ .endif
+ conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+100:
+ .else
+ conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .endif
+.endm
+
+.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+ /* Can't interleave reads and writes */
+ test
+ conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
+ test
+ .endif
+ conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .else
+ /* Can interleave reads and writes for better scheduling */
+ test
+ process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
+ process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
+ .if decrementx
+ sub&cond1 X, X, #8*numbytes1/dst_w_bpp
+ sub&cond2 X, X, #8*numbytes2/dst_w_bpp
+ .endif
+ process_tail cond1, numbytes1, firstreg1
+ process_tail cond2, numbytes2, firstreg2
+ pixst cond1, numbytes1, firstreg1, DST
+ pixst cond2, numbytes2, firstreg2, DST
+ .endif
+.endm
+
+
+.macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .else
+ movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .endif
+.endm
+
+.macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .else
+ movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .endif
+.endm
+
+.macro leading_15bytes process_head, process_tail
+ /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set DECREMENT_X, 0
+ sub X, X, WK0, lsr #dst_bpp_shift
+ str X, [sp, #LINE_SAVED_REG_COUNT*4]
+ mov X, WK0
+ .endif
+ /* Use unaligned loads in all cases for simplicity */
+ .if dst_w_bpp == 8
+ conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+ .elseif dst_w_bpp == 16
+ test_bits_1_0_ptr
+ conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+ conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
+ .endif
+.endm
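+
+/* Worked example: with an 8bpp destination and WK0 = 13 leading bytes
+ * (binary 1101), the tests above process 1 byte (bit 0 set), skip the
+ * 2-byte step (bit 1 clear), then process 4 and 8 bytes (bits 2 and 3 set),
+ * consuming exactly the 13 bytes needed to reach 16-byte alignment.
+ */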
+
+.macro test_bits_3_2_pix
+ movs SCRATCH, X, lsl #dst_bpp_shift+32-3
+.endm
+
+.macro test_bits_1_0_pix
+ .if dst_w_bpp == 8
+ movs SCRATCH, X, lsl #dst_bpp_shift+32-1
+ .else
+ movs SCRATCH, X, lsr #1
+ .endif
+.endm
+
+.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ .if dst_w_bpp == 16
+ test_bits_1_0_pix
+ conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ .elseif dst_w_bpp == 8
+ conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ .endif
+.endm
+
+
+.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110:
+ .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
+ .rept pix_per_block*dst_w_bpp/128
+ process_head , 16, 0, unaligned_src, unaligned_mask, 1
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ preload_middle src_bpp, SRC, 1
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ preload_middle mask_bpp, MASK, 1
+ .else
+ preload_middle src_bpp, SRC, 0
+ preload_middle mask_bpp, MASK, 0
+ .endif
+ .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
+ /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
+ * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
+ * preloads for, to achieve staggered prefetches for multiple channels, because there are
+ * always two STMs per prefetch, so there is always an opposite STM on which to put the
+ * preload. Note, no need to BIC the base register here */
+ PF pld, [DST, #32*prefetch_distance - dst_alignment]
+ .endif
+ process_tail , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 16, 0, DST
+ .endif
+ .set SUBBLOCK, SUBBLOCK+1
+ .endr
+ subs X, X, #pix_per_block
+ bhs 110b
+.endm
+
+.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
+ /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
+ .if dst_r_bpp > 0
+ tst DST, #16
+ bne 111f
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+ b 112f
+111:
+ .endif
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+112:
+ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
+ PF and, WK0, X, #pix_per_block-1
+ .endif
+ preload_trailing src_bpp, src_bpp_shift, SRC
+ preload_trailing mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_trailing dst_r_bpp, dst_bpp_shift, DST
+ .endif
+ add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
+ /* The remainder of the line is handled identically to the medium case */
+ medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+.endm
+
+.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+120:
+ process_head , 16, 0, unaligned_src, unaligned_mask, 0
+ process_tail , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 16, 0, DST
+ .endif
+ subs X, X, #128/dst_w_bpp
+ bhs 120b
+ /* Trailing pixels */
+ tst X, #128/dst_w_bpp - 1
+ beq exit_label
+ trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+ tst X, #16*8/dst_w_bpp
+ conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ /* Trailing pixels */
+ /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
+ trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
+ /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
+ .if mask_bpp == 8 || mask_bpp == 16
+ tst MASK, #3
+ bne 141f
+ .endif
+ .if src_bpp == 8 || src_bpp == 16
+ tst SRC, #3
+ bne 140f
+ .endif
+ action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ .if src_bpp == 8 || src_bpp == 16
+ b exit_label
+140:
+ action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ .endif
+ .if mask_bpp == 8 || mask_bpp == 16
+ b exit_label
+141:
+ .if src_bpp == 8 || src_bpp == 16
+ tst SRC, #3
+ bne 142f
+ .endif
+ action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ .if src_bpp == 8 || src_bpp == 16
+ b exit_label
+142:
+ action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ .endif
+ .endif
+.endm
+
+
+.macro end_of_line restore_x, vars_spilled, loop_label, last_one
+ .if vars_spilled
+ /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive. */
+ /* This is ldmia sp,{} */
+ .word 0xE89D0000 | LINE_SAVED_REGS
+ .endif
+ subs Y, Y, #1
+ .if vars_spilled
+ .if (LINE_SAVED_REGS) & (1<<1)
+ str Y, [sp]
+ .endif
+ .endif
+ add DST, DST, STRIDE_D
+ .if src_bpp > 0
+ add SRC, SRC, STRIDE_S
+ .endif
+ .if mask_bpp > 0
+ add MASK, MASK, STRIDE_M
+ .endif
+ .if restore_x
+ mov X, ORIG_W
+ .endif
+ bhs loop_label
+ .ifc "last_one",""
+ .if vars_spilled
+ b 197f
+ .else
+ b 198f
+ .endif
+ .else
+ .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ b 198f
+ .endif
+ .endif
+.endm
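+
+/* For example, after "line_saved_regs STRIDE_D, ORIG_W" (see the macro at
+ * the end of this file), LINE_SAVED_REGS is (1<<3)|(1<<14) = 0x4008, so the
+ * .word above assembles to 0xE89D4008 ("ldmia sp, {r3, lr}") and the
+ * matching 0xE92D0000 form used in generate_composite_function becomes
+ * 0xE92D4008 ("stmdb sp!, {r3, lr}").
+ */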
+
+
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags_, \
+ prefetch_distance_, \
+ init, \
+ newline, \
+ cleanup, \
+ process_head, \
+ process_tail, \
+ process_inner_loop
+
+ pixman_asm_function fname
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set flags, flags_
+ .set prefetch_distance, prefetch_distance_
+
+/*
+ * Select prefetch type for this function.
+ */
+ .if prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .else
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
+ .endif
+
+ .if src_bpp == 32
+ .set src_bpp_shift, 2
+ .elseif src_bpp == 24
+ .set src_bpp_shift, 0
+ .elseif src_bpp == 16
+ .set src_bpp_shift, 1
+ .elseif src_bpp == 8
+ .set src_bpp_shift, 0
+ .elseif src_bpp == 0
+ .set src_bpp_shift, -1
+ .else
+ .error "requested src bpp (src_bpp) is not supported"
+ .endif
+
+ .if mask_bpp == 32
+ .set mask_bpp_shift, 2
+ .elseif mask_bpp == 24
+ .set mask_bpp_shift, 0
+ .elseif mask_bpp == 8
+ .set mask_bpp_shift, 0
+ .elseif mask_bpp == 0
+ .set mask_bpp_shift, -1
+ .else
+ .error "requested mask bpp (mask_bpp) is not supported"
+ .endif
+
+ .if dst_w_bpp == 32
+ .set dst_bpp_shift, 2
+ .elseif dst_w_bpp == 24
+ .set dst_bpp_shift, 0
+ .elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+ .else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+ .endif
+
+ .if (((flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+ .else
+ .set dst_r_bpp, 0
+ .endif
+
+ .set pix_per_block, 16*8/dst_w_bpp
+ .if src_bpp != 0
+ .if 32*8/src_bpp > pix_per_block
+ .set pix_per_block, 32*8/src_bpp
+ .endif
+ .endif
+ .if mask_bpp != 0
+ .if 32*8/mask_bpp > pix_per_block
+ .set pix_per_block, 32*8/mask_bpp
+ .endif
+ .endif
+ .if dst_r_bpp != 0
+ .if 32*8/dst_r_bpp > pix_per_block
+ .set pix_per_block, 32*8/dst_r_bpp
+ .endif
+ .endif
+
+/* The standard entry conditions set up by pixman-arm-common.h are:
+ * r0 = width (pixels)
+ * r1 = height (rows)
+ * r2 = pointer to top-left pixel of destination
+ * r3 = destination stride (pixels)
+ * [sp] = source pixel value, or pointer to top-left pixel of source
+ * [sp,#4] = 0 or source stride (pixels)
+ * The following arguments are unused for non-mask operations
+ * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
+ * [sp,#12] = 0 or mask stride (pixels)
+ */
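+
+/* A rough sketch, for orientation only (argument types assumed here, not
+ * the literal pixman-arm-common.h declaration), of how a generated
+ * source+destination function is called from C:
+ *
+ *     void fname (int32_t width, int32_t height,
+ *                 uint32_t *dst, int32_t dst_stride,
+ *                 uint32_t *src, int32_t src_stride);
+ *
+ * with a mask pointer and stride following on the stack when a mask
+ * channel is used.
+ */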
+
+/*
+ * Assign symbolic names to registers
+ */
+ X .req r0 /* pixels to go on this line */
+ Y .req r1 /* lines to go */
+ DST .req r2 /* destination pixel pointer */
+ STRIDE_D .req r3 /* destination stride (bytes, minus width) */
+ SRC .req r4 /* source pixel pointer */
+ STRIDE_S .req r5 /* source stride (bytes, minus width) */
+ MASK .req r6 /* mask pixel pointer (if applicable) */
+ STRIDE_M .req r7 /* mask stride (bytes, minus width) */
+ WK0 .req r8 /* pixel data registers */
+ WK1 .req r9
+ WK2 .req r10
+ WK3 .req r11
+ SCRATCH .req r12
+ ORIG_W .req r14 /* width (pixels) */
+
+ push {r4-r11, lr} /* save all registers */
+
+ subs Y, Y, #1
+ blo 199f
+
+#ifdef DEBUG_PARAMS
+ sub sp, sp, #9*4
+#endif
+
+ .if src_bpp > 0
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
+ .endif
+ .if mask_bpp > 0
+ ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
+ ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
+ .endif
+
+#ifdef DEBUG_PARAMS
+ add Y, Y, #1
+ stmia sp, {r0-r7,pc}
+ sub Y, Y, #1
+#endif
+
+ init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ /* Reserve a word in which to store X during leading pixels */
+ sub sp, sp, #4
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
+
+ lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
+ sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
+ .if src_bpp > 0
+ lsl STRIDE_S, #src_bpp_shift
+ sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
+ .endif
+ .if mask_bpp > 0
+ lsl STRIDE_M, #mask_bpp_shift
+ sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
+ .endif
+
+ /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
+ cmp X, #2*16*8/dst_w_bpp - 1
+ blo 170f
+ .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
+ /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
+ cmp X, #(prefetch_distance+3)*pix_per_block - 1
+ blo 160f
+
+ /* Wide case */
+ /* Adjust X so that the decrement instruction can also test for
+ * inner loop termination. We want it to stop when there are
+ * (prefetch_distance+1) complete blocks to go. */
+ sub X, X, #(prefetch_distance+2)*pix_per_block
+ mov ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .endif
+151: /* New line */
+ newline
+ preload_leading_step1 src_bpp, WK1, SRC
+ preload_leading_step1 mask_bpp, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_leading_step1 dst_r_bpp, WK3, DST
+ .endif
+
+ ands WK0, DST, #15
+ beq 154f
+ rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
+
+ preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
+ preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
+ .endif
+
+ leading_15bytes process_head, process_tail
+
+154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ and SCRATCH, SRC, #31
+ rsb SCRATCH, SCRATCH, #32*prefetch_distance
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ and SCRATCH, MASK, #31
+ rsb SCRATCH, SCRATCH, #32*prefetch_distance
+ .endif
+ .ifc "process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .else
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ .endif
+
+157: /* Check for another line */
+ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
+ .endif
+
+ .ltorg
+
+160: /* Medium case */
+ mov ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .endif
+161: /* New line */
+ newline
+ preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
+ preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
+
+ sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
+ ands WK0, DST, #15
+ beq 164f
+ rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
+
+ leading_15bytes process_head, process_tail
+
+164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+
+167: /* Check for another line */
+ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
+
+ .ltorg
+
+170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if dst_w_bpp < 32
+ mov ORIG_W, X
+ .endif
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .endif
+171: /* New line */
+ newline
+ preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
+ preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
+
+ .if dst_w_bpp == 8
+ tst DST, #3
+ beq 174f
+172: subs X, X, #1
+ blo 177f
+ process_head , 1, 0, 1, 1, 0
+ process_tail , 1, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 1, 0, DST
+ .endif
+ tst DST, #3
+ bne 172b
+ .elseif dst_w_bpp == 16
+ tst DST, #2
+ beq 174f
+ subs X, X, #1
+ blo 177f
+ process_head , 2, 0, 1, 1, 0
+ process_tail , 2, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 2, 0, DST
+ .endif
+ .endif
+
+174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+
+177: /* Check for another line */
+ end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
+
+197:
+ .if (flags) & FLAG_SPILL_LINE_VARS
+ add sp, sp, #LINE_SAVED_REG_COUNT*4
+ .endif
+198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+ add sp, sp, #4
+ .endif
+
+ cleanup
+
+#ifdef DEBUG_PARAMS
+ add sp, sp, #9*4 /* junk the debug copy of arguments */
+#endif
+199:
+ pop {r4-r11, pc} /* exit */
+
+ .ltorg
+
+ .unreq X
+ .unreq Y
+ .unreq DST
+ .unreq STRIDE_D
+ .unreq SRC
+ .unreq STRIDE_S
+ .unreq MASK
+ .unreq STRIDE_M
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ .unreq SCRATCH
+ .unreq ORIG_W
+ .endfunc
+.endm
+
+.macro line_saved_regs x:vararg
+ .set LINE_SAVED_REGS, 0
+ .set LINE_SAVED_REG_COUNT, 0
+ .irp SAVED_REG,x
+ .ifc "SAVED_REG","Y"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_D"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_S"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_M"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","ORIG_W"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .endr
+.endm
+
+.macro nop_macro x:vararg
+.endm
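+
+/*
+ * Usage sketch (hypothetical names, for illustration only): a .S file that
+ * includes this header defines its process_head/process_tail macros and
+ * then instantiates a function, for example
+ *
+ *     generate_composite_function \
+ *         pixman_composite_something_asm_armv6, 32, 0, 32, \
+ *         FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+ *         2, \
+ *         nop_macro, nop_macro, nop_macro, \
+ *         something_process_head, something_process_tail
+ *
+ * i.e. src_bpp = 32, mask_bpp = 0, dst_w_bpp = 32, a prefetch distance of
+ * 2, no init/newline/cleanup work, the hypothetical process macros, and
+ * the default wide-case inner loop (process_inner_loop omitted).
+ */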