Diffstat (limited to 'libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S')
-rw-r--r--	libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S	1179
1 file changed, 0 insertions, 1179 deletions
diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S
deleted file mode 100644
index a74a0a8..0000000
--- a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S
+++ /dev/null
@@ -1,1179 +0,0 @@
-/*
- * Copyright © 2012 Raspberry Pi Foundation
- * Copyright © 2012 RISC OS Open Ltd
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of the copyright holders not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. The copyright holders make no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Ben Avison (bavison@riscosopen.org)
- *
- */
-
-/* Prevent the stack from becoming executable */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
-        .text
-        .arch armv6
-        .object_arch armv4
-        .arm
-        .altmacro
-        .p2align 2
-
-#include "pixman-arm-asm.h"
-#include "pixman-arm-simd-asm.h"
-
-/* A head macro should do all processing which results in an output of up to
- * 16 bytes, as far as the final load instruction. The corresponding tail macro
- * should complete the processing of the up-to-16 bytes. The calling macro will
- * sometimes choose to insert a preload or a decrement of X between them.
- *   cond            ARM condition code for code block
- *   numbytes        Number of output bytes that should be generated this time
- *   firstreg        First WK register in which to place output
- *   unaligned_src   Whether to use non-wordaligned loads of source image
- *   unaligned_mask  Whether to use non-wordaligned loads of mask image
- *   preload         If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
- */
-
-.macro blit_init
-        line_saved_regs STRIDE_D, STRIDE_S
-.endm
-
-.macro blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
-.endm
-
-.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
-    WK4 .req STRIDE_D
-    WK5 .req STRIDE_S
-    WK6 .req MASK
-    WK7 .req STRIDE_M
-110:    pixld   , 16, 0, SRC, unaligned_src
-        pixld   , 16, 4, SRC, unaligned_src
-        pld     [SRC, SCRATCH]
-        pixst   , 16, 0, DST
-        pixst   , 16, 4, DST
-        subs    X, X, #32*8/src_bpp
-        bhs     110b
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-generate_composite_function \
-    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
-    4, /* prefetch distance */ \
-    blit_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    blit_process_head, \
-    nop_macro, /* process tail */ \
-    blit_inner_loop
-
-generate_composite_function \
-    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
-    4, /* prefetch distance */ \
-    blit_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    blit_process_head, \
-    nop_macro, /* process tail */ \
-    blit_inner_loop
-
-generate_composite_function \
-    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
-    3, /* prefetch distance */ \
-    blit_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    blit_process_head, \
-    nop_macro, /* process tail */ \
-    blit_inner_loop
-
-/******************************************************************************/
-
-.macro src_n_8888_init
-        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
-        mov     STRIDE_S, SRC
-        mov     MASK, SRC
-        mov     STRIDE_M, SRC
-.endm
-
-.macro src_n_0565_init
-        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #16
-        mov     STRIDE_S, SRC
-        mov     MASK, SRC
-        mov     STRIDE_M, SRC
-.endm
-
-.macro src_n_8_init
-        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #8
-        orr     SRC, SRC, lsl #16
-        mov     STRIDE_S, SRC
-        mov     MASK, SRC
-        mov     STRIDE_M, SRC
-.endm
-
-.macro fill_process_tail  cond, numbytes, firstreg
-    WK4 .req SRC
-    WK5 .req STRIDE_S
-    WK6 .req MASK
-    WK7 .req STRIDE_M
-        pixst   cond, numbytes, 4, DST
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-generate_composite_function \
-    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
-    0, /* prefetch distance doesn't apply */ \
-    src_n_8888_init \
-    nop_macro, /* newline */ \
-    nop_macro /* cleanup */ \
-    nop_macro /* process head */ \
-    fill_process_tail
-
-generate_composite_function \
-    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
-    0, /* prefetch distance doesn't apply */ \
-    src_n_0565_init \
-    nop_macro, /* newline */ \
-    nop_macro /* cleanup */ \
-    nop_macro /* process head */ \
-    fill_process_tail
-
-generate_composite_function \
-    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
-    0, /* prefetch distance doesn't apply */ \
-    src_n_8_init \
-    nop_macro, /* newline */ \
-    nop_macro /* cleanup */ \
-    nop_macro /* process head */ \
-    fill_process_tail
-
-/******************************************************************************/
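The three src_n_*_init macros above replicate a solid source colour so that one 32-bit register holds as many whole pixels as fit in a word; the fill loop can then issue plain word stores through fill_process_tail without touching the colour again. A minimal C sketch of that replication step, assuming the colour has already been fetched from the argument stack (the function names here are illustrative, not pixman's):

    #include <stdint.h>

    /* Replicate an 8-bit solid colour into all four bytes of a word,
     * as src_n_8_init does with two ORRs. */
    static uint32_t replicate_a8(uint8_t c)
    {
        uint32_t w = c;
        w |= w << 8;     /* one byte  -> two bytes  */
        w |= w << 16;    /* two bytes -> four bytes */
        return w;
    }

    /* Replicate a 16-bit RGB565 colour into both halfwords,
     * as src_n_0565_init does with a single ORR. */
    static uint32_t replicate_0565(uint16_t c)
    {
        uint32_t w = c;
        w |= w << 16;
        return w;
    }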
-
-.macro src_x888_8888_pixel, cond, reg
-        orr&cond WK&reg, WK&reg, #0xFF000000
-.endm
-
-.macro pixman_composite_src_x888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
-.endm
-
-.macro pixman_composite_src_x888_8888_process_tail  cond, numbytes, firstreg
-        src_x888_8888_pixel cond, %(firstreg+0)
-    .if numbytes >= 8
-        src_x888_8888_pixel cond, %(firstreg+1)
-     .if numbytes == 16
-        src_x888_8888_pixel cond, %(firstreg+2)
-        src_x888_8888_pixel cond, %(firstreg+3)
-     .endif
-    .endif
-.endm
-
-generate_composite_function \
-    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
-    3, /* prefetch distance */ \
-    nop_macro, /* init */ \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    pixman_composite_src_x888_8888_process_head, \
-    pixman_composite_src_x888_8888_process_tail
-
-/******************************************************************************/
-
-.macro src_0565_8888_init
-        /* Hold loop invariants in MASK and STRIDE_M */
-        ldr     MASK, =0x07E007E0
-        mov     STRIDE_M, #0xFF000000
-        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
-        ldr     SCRATCH, =0x80008000
-        uadd8   SCRATCH, SCRATCH, SCRATCH
-.endm
-
-.macro src_0565_8888_2pixels, reg1, reg2
-        and     SCRATCH, WK&reg1, MASK              @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg2, WK&reg1, MASK              @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6   @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg1, WK&reg2, lsl #16           @ rrrrr000000bbbbb0000000000000000
-        mov     SCRATCH, SCRATCH, ror #19           @ GGGG0000ggggggggggg00000GGGGGGGG
-        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
-        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
-        sel     WK&reg1, WK&reg1, SCRATCH           @ rrrrrrrrggggggggbbbbbbbb--------
-        mov     SCRATCH, SCRATCH, ror #16           @ ggg00000GGGGGGGGGGGG0000gggggggg
-        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
-        sel     WK&reg2, WK&reg2, SCRATCH           @ RRRRRRRRGGGGGGGGBBBBBBBB--------
-        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8  @ 11111111rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8  @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-.endm
-
-/* This version doesn't need STRIDE_M, but is one instruction longer.
-   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
-        and     SCRATCH, WK&reg1, MASK              @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg1, WK&reg1, MASK              @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6   @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg2, WK&reg1, lsr #16           @ 0000000000000000RRRRR000000BBBBB
-        mov     SCRATCH, SCRATCH, ror #27           @ GGGGGGGGGGGG0000ggggggggggg00000
-        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
-        mov     WK&reg2, WK&reg2, lsl #3            @ 0000000000000RRRRR000000BBBBB000
-        mov     WK&reg1, WK&reg1, lsl #3            @ 0000000000000rrrrr000000bbbbb000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
-        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg2, SCRATCH, WK&reg2           @ --------RRRRRRRRGGGGGGGGBBBBBBBB
-        sel     WK&reg1, SCRATCH, WK&reg1           @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, WK&reg2, #0xFF000000       @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-        orr     WK&reg1, WK&reg1, #0xFF000000       @ 11111111rrrrrrrrggggggggbbbbbbbb
-*/
-
-.macro src_0565_8888_1pixel, reg
-        bic     SCRATCH, WK&reg, MASK               @ 0000000000000000rrrrr000000bbbbb
-        and     WK&reg, WK&reg, MASK                @ 000000000000000000000gggggg00000
-        mov     SCRATCH, SCRATCH, lsl #3            @ 0000000000000rrrrr000000bbbbb000
-        mov     WK&reg, WK&reg, lsl #5              @ 0000000000000000gggggg0000000000
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        orr     WK&reg, WK&reg, WK&reg, lsr #6      @ 000000000000000gggggggggggg00000
-        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg, WK&reg, SCRATCH             @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg, WK&reg, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb
-.endm
-
-.macro src_0565_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    .if numbytes == 16
-        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
-    .elseif numbytes == 8
-        pixld   , 4, firstreg, SRC, unaligned_src
-    .elseif numbytes == 4
-        pixld   , 2, firstreg, SRC, unaligned_src
-    .endif
-.endm
-
-.macro src_0565_8888_process_tail  cond, numbytes, firstreg
-    .if numbytes == 16
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
-        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
-    .elseif numbytes == 8
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
-    .else
-        src_0565_8888_1pixel firstreg
-    .endif
-.endm
-
-generate_composite_function \
-    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
-    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
-    3, /* prefetch distance */ \
-    src_0565_8888_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    src_0565_8888_process_head, \
-    src_0565_8888_process_tail
-
-/******************************************************************************/
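The src_0565_8888_* macros above widen RGB565 to ARGB8888: each 5- or 6-bit field is shifted up and its top bits are replicated into the freed low bits (so 0x1F becomes 0xFF), and the alpha byte is forced to 255. A hedged C equivalent of the per-pixel conversion, for reference only and not taken from pixman:

    #include <stdint.h>

    /* Expand one RGB565 pixel to ARGB8888, replicating the high bits of each
     * field into the low bits so that full white maps to 0xFFFFFFFF. */
    static uint32_t rgb565_to_argb8888(uint16_t p)
    {
        uint32_t r5 = (p >> 11) & 0x1F;
        uint32_t g6 = (p >> 5)  & 0x3F;
        uint32_t b5 =  p        & 0x1F;

        uint32_t r8 = (r5 << 3) | (r5 >> 2);   /* rrrrr  -> rrrrrrrr */
        uint32_t g8 = (g6 << 2) | (g6 >> 4);   /* gggggg -> gggggggg */
        uint32_t b8 = (b5 << 3) | (b5 >> 2);   /* bbbbb  -> bbbbbbbb */

        return 0xFF000000u | (r8 << 16) | (g8 << 8) | b8;
    }

The assembly arrives at the same bit patterns two pixels per iteration, using SEL (driven by the GE flags set up in src_0565_8888_init) to merge the recombined red/blue and green fields without per-channel masking.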
-
-.macro src_x888_0565_init
-        /* Hold loop invariant in MASK */
-        ldr     MASK, =0x001F001F
-        line_saved_regs  STRIDE_S, ORIG_W
-.endm
-
-.macro src_x888_0565_1pixel  s, d
-        and     WK&d, MASK, WK&s, lsr #3       @ 00000000000rrrrr00000000000bbbbb
-        and     STRIDE_S, WK&s, #0xFC00        @ 0000000000000000gggggg0000000000
-        orr     WK&d, WK&d, WK&d, lsr #5       @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&d, WK&d, STRIDE_S, lsr #5   @ 00000000000-----rrrrrggggggbbbbb
-        /* Top 16 bits are discarded during the following STRH */
-.endm
-
-.macro src_x888_0565_2pixels  slo, shi, d, tmp
-        and     SCRATCH, WK&shi, #0xFC00       @ 0000000000000000GGGGGG0000000000
-        and     WK&tmp, MASK, WK&shi, lsr #3   @ 00000000000RRRRR00000000000BBBBB
-        and     WK&shi, MASK, WK&slo, lsr #3   @ 00000000000rrrrr00000000000bbbbb
-        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
-        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
-        and     SCRATCH, WK&slo, #0xFC00       @ 0000000000000000gggggg0000000000
-        orr     WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
-        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16  @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
-.endm
-
-.macro src_x888_0565_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    WK4 .req STRIDE_S
-    WK5 .req STRIDE_M
-    WK6 .req WK3
-    WK7 .req ORIG_W
-    .if numbytes == 16
-        pixld   , 16, 4, SRC, 0
-        src_x888_0565_2pixels  4, 5, 0, 0
-        pixld   , 8, 4, SRC, 0
-        src_x888_0565_2pixels  6, 7, 1, 1
-        pixld   , 8, 6, SRC, 0
-    .else
-        pixld   , numbytes*2, 4, SRC, 0
-    .endif
-.endm
-
-.macro src_x888_0565_process_tail  cond, numbytes, firstreg
-    .if numbytes == 16
-        src_x888_0565_2pixels  4, 5, 2, 2
-        src_x888_0565_2pixels  6, 7, 3, 4
-    .elseif numbytes == 8
-        src_x888_0565_2pixels  4, 5, 1, 1
-        src_x888_0565_2pixels  6, 7, 2, 2
-    .elseif numbytes == 4
-        src_x888_0565_2pixels  4, 5, 1, 1
-    .else
-        src_x888_0565_1pixel  4, 1
-    .endif
-    .if numbytes == 16
-        pixst   , numbytes, 0, DST
-    .else
-        pixst   , numbytes, 1, DST
-    .endif
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-generate_composite_function \
-    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
-    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
-    3, /* prefetch distance */ \
-    src_x888_0565_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    src_x888_0565_process_head, \
-    src_x888_0565_process_tail
-
-/******************************************************************************/
-
-.macro add_8_8_8pixels  cond, dst1, dst2
-        uqadd8&cond  WK&dst1, WK&dst1, MASK
-        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
-.endm
-
-.macro add_8_8_4pixels  cond, dst
-        uqadd8&cond  WK&dst, WK&dst, MASK
-.endm
-
-.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    WK4 .req MASK
-    WK5 .req STRIDE_M
-    .if numbytes == 16
-        pixld   cond, 8, 4, SRC, unaligned_src
-        pixld   cond, 16, firstreg, DST, 0
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
-        pixld   cond, 8, 4, SRC, unaligned_src
-    .else
-        pixld   cond, numbytes, 4, SRC, unaligned_src
-        pixld   cond, numbytes, firstreg, DST, 0
-    .endif
-    .unreq WK4
-    .unreq WK5
-.endm
-
-.macro add_8_8_process_tail  cond, numbytes, firstreg
-    .if numbytes == 16
-        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
-    .elseif numbytes == 8
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
-    .else
-        add_8_8_4pixels cond, firstreg
-    .endif
-.endm
-
-generate_composite_function \
-    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
-    2, /* prefetch distance */ \
-    nop_macro, /* init */ \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    add_8_8_process_head, \
-    add_8_8_process_tail
-
-/******************************************************************************/
-
-.macro over_8888_8888_init
-        /* Hold loop invariant in MASK */
-        ldr     MASK, =0x00800080
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, MASK, MASK
-        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
-.endm
-
-.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    WK4 .req STRIDE_D
-    WK5 .req STRIDE_S
-    WK6 .req STRIDE_M
-    WK7 .req ORIG_W
-        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
-        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
-        teq     WK&reg0, #0
-    .if numbytes > 4
-        teqeq   WK&reg1, #0
-     .if numbytes > 8
-        teqeq   WK&reg2, #0
-        teqeq   WK&reg3, #0
-     .endif
-    .endif
-.endm
-
-.macro over_8888_8888_prepare  next
-        mov     WK&next, WK&next, lsr #24
-.endm
-
-.macro over_8888_8888_1pixel  src, dst, offset, next
-        /* src = destination component multiplier */
-        rsb     WK&src, WK&src, #255
-        /* Split even/odd bytes of dst into SCRATCH/dst */
-        uxtb16  SCRATCH, WK&dst
-        uxtb16  WK&dst, WK&dst, ror #8
-        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
-        mla     SCRATCH, SCRATCH, WK&src, MASK
-        mla     WK&dst, WK&dst, WK&src, MASK
-        /* Where we would have had a stall between the result of the first MLA and the shifter input,
-         * reload the complete source pixel */
-        ldr     WK&src, [SRC, #offset]
-        /* Multiply by 257/256 to approximate 256/255 */
-        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        /* In this stall, start processing the next pixel */
-    .if offset < -4
-        mov     WK&next, WK&next, lsr #24
-    .endif
-        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
-        /* Recombine even/odd bytes of multiplied destination */
-        mov     SCRATCH, SCRATCH, ror #8
-        sel     WK&dst, SCRATCH, WK&dst
-        /* Saturated add of source to multiplied destination */
-        uqadd8  WK&dst, WK&dst, WK&src
-.endm
-
-.macro over_8888_8888_process_tail  cond, numbytes, firstreg
-    WK4 .req STRIDE_D
-    WK5 .req STRIDE_S
-    WK6 .req STRIDE_M
-    WK7 .req ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
-        beq     10f
-        over_8888_8888_prepare %(4+firstreg)
-    .set PROCESS_REG, firstreg
-    .set PROCESS_OFF, -numbytes
-    .rept numbytes / 4
-        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
-     .set PROCESS_REG, PROCESS_REG+1
-     .set PROCESS_OFF, PROCESS_OFF+4
-    .endr
-        pixst   , numbytes, firstreg, DST
-10:
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
-    2, /* prefetch distance */ \
-    over_8888_8888_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    over_8888_8888_process_head, \
-    over_8888_8888_process_tail
-
-/******************************************************************************/
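over_8888_8888_1pixel above is premultiplied source-over: every destination byte is scaled by 255 minus the source alpha and the source is then added with saturation, with the division by 255 handled as the comments describe (add 0x80 for rounding, then multiply by 257/256). A C sketch of the same per-channel arithmetic, with helper names of my own choosing:

    #include <stdint.h>

    /* Scale a 0..255 channel value d by m/255 with the rounding used above:
     * t = d*m + 0x80, then take the high byte of t + (t >> 8). */
    static uint8_t mul_div_255(uint8_t d, uint8_t m)
    {
        uint32_t t = (uint32_t)d * m + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }

    /* Per-byte saturating add, the C analogue of UQADD8. */
    static uint8_t sat_add_u8(uint8_t a, uint8_t b)
    {
        uint32_t s = (uint32_t)a + b;
        return (uint8_t)(s > 255 ? 255 : s);
    }

    /* Premultiplied OVER for one ARGB8888 pixel: dst' = src + dst*(255-Sa)/255. */
    static uint32_t over_pixel(uint32_t src, uint32_t dst)
    {
        uint8_t  not_sa = (uint8_t)(255 - (src >> 24));
        uint32_t out = 0;
        for (int sh = 0; sh < 32; sh += 8) {
            uint8_t d = (uint8_t)(dst >> sh);
            uint8_t s = (uint8_t)(src >> sh);
            out |= (uint32_t)sat_add_u8(s, mul_div_255(d, not_sa)) << sh;
        }
        return out;
    }

The assembly does this two channels at a time per register half (UXTB16, MLA, UXTAB16) and merges the halves with SEL; the same sequence is factored out as mul_8888_8 immediately below for the routines that follow.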
-
-/* Multiply each byte of a word by a byte.
- * Useful when there aren't any obvious ways to fill the stalls with other instructions.
- * word  Register containing 4 bytes
- * byte  Register containing byte multiplier (bits 8-31 must be 0)
- * tmp   Scratch register
- * half  Register containing the constant 0x00800080
- * GE[3:0] bits must contain 0101
- */
-.macro mul_8888_8  word, byte, tmp, half
-        /* Split even/odd bytes of word apart */
-        uxtb16  tmp, word
-        uxtb16  word, word, ror #8
-        /* Multiply bytes together with rounding, then by 257/256 */
-        mla     tmp, tmp, byte, half
-        mla     word, word, byte, half  /* 1 stall follows */
-        uxtab16 tmp, tmp, tmp, ror #8   /* 1 stall follows */
-        uxtab16 word, word, word, ror #8
-        /* Recombine bytes */
-        mov     tmp, tmp, ror #8
-        sel     word, tmp, word
-.endm
-
-/******************************************************************************/
-
-.macro over_8888_n_8888_init
-        /* Mask is constant */
-        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
-        /* Hold loop invariant in STRIDE_M */
-        ldr     STRIDE_M, =0x00800080
-        /* We only want the alpha bits of the constant mask */
-        mov     MASK, MASK, lsr #24
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, STRIDE_M, STRIDE_M
-        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
-.endm
-
-.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    WK4 .req Y
-    WK5 .req STRIDE_D
-    WK6 .req STRIDE_S
-    WK7 .req ORIG_W
-        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-.macro over_8888_n_8888_1pixel  src, dst
-        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
-        sub     WK7, WK6, WK&src, lsr #24
-        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
-        uqadd8  WK&dst, WK&dst, WK&src
-.endm
-
-.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
-    WK4 .req Y
-    WK5 .req STRIDE_D
-    WK6 .req STRIDE_S
-    WK7 .req ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
-        beq     10f
-        mov     WK6, #255
-    .set PROCESS_REG, firstreg
-    .rept numbytes / 4
-     .if numbytes == 16 && PROCESS_REG == 2
-        /* We're using WK6 and WK7 as temporaries, so half way through
-         * 4 pixels, reload the second two source pixels but this time
-         * into WK4 and WK5 */
-        ldmdb   SRC, {WK4, WK5}
-     .endif
-        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
-     .set PROCESS_REG, PROCESS_REG+1
-    .endr
-        pixst   , numbytes, firstreg, DST
-10:
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    .unreq WK7
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
-    2, /* prefetch distance */ \
-    over_8888_n_8888_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    over_8888_n_8888_process_head, \
-    over_8888_n_8888_process_tail
-
-/******************************************************************************/
-
-.macro over_n_8_8888_init
-        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
-        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
-        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
-        ldr     SCRATCH, =0x00800080
-        uxtb16  STRIDE_S, SRC
-        uxtb16  SRC, SRC, ror #8
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, SCRATCH, SCRATCH
-        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
-.endm
-
-.macro over_n_8_8888_newline
-        ldr     STRIDE_D, =0x00800080
-        b       1f
-    .ltorg
-1:
-.endm
-
-.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    WK4 .req STRIDE_M
-        pixld   , numbytes/4, 4, MASK, unaligned_mask
-        pixld   , numbytes, firstreg, DST, 0
-    .unreq WK4
-.endm
-
-.macro over_n_8_8888_1pixel  src, dst
-        uxtb    Y, WK4, ror #src*8
-        /* Trailing part of multiplication of source */
-        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
-        mla     Y, SRC, Y, STRIDE_D
-        mov     ORIG_W, #255
-        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        uxtab16 Y, Y, Y, ror #8
-        mov     SCRATCH, SCRATCH, ror #8
-        sub     ORIG_W, ORIG_W, Y, lsr #24
-        sel     Y, SCRATCH, Y
-        /* Then multiply the destination */
-        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
-        uqadd8  WK&dst, WK&dst, Y
-.endm
-
-.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
-    WK4 .req STRIDE_M
-        teq     WK4, #0
-        beq     10f
-    .set PROCESS_REG, firstreg
-    .rept numbytes / 4
-        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
-     .set PROCESS_REG, PROCESS_REG+1
-    .endr
-        pixst   , numbytes, firstreg, DST
-10:
-    .unreq WK4
-.endm
-
-generate_composite_function \
-    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
-    2, /* prefetch distance */ \
-    over_n_8_8888_init, \
-    over_n_8_8888_newline, \
-    nop_macro, /* cleanup */ \
-    over_n_8_8888_process_head, \
-    over_n_8_8888_process_tail
-
-/******************************************************************************/
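over_8888_n_8888 and over_n_8_8888 above both apply OVER through a mask: the source pixel is first scaled by the 8-bit mask value, then the destination is scaled by the complement of the resulting alpha and the two are added with saturation, which is what the pairs of mul_8888_8 plus uqadd8 compute per word. A standalone C sketch of one masked pixel (helper names are mine, repeated from the earlier sketch so this block compiles on its own):

    #include <stdint.h>

    static uint8_t mul_div_255(uint8_t a, uint8_t b)   /* round(a*b/255) */
    {
        uint32_t t = (uint32_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }

    static uint8_t sat_add_u8(uint8_t a, uint8_t b)    /* UQADD8, one byte */
    {
        uint32_t s = (uint32_t)a + b;
        return (uint8_t)(s > 255 ? 255 : s);
    }

    /* OVER with an 8-bit mask on a premultiplied ARGB8888 source:
     *   tmp  = src * m / 255                  (per channel)
     *   dst' = tmp + dst * (255 - A(tmp)) / 255                      */
    static uint32_t over_masked_pixel(uint32_t src, uint8_t m, uint32_t dst)
    {
        uint32_t tmp = 0, out = 0;
        for (int sh = 0; sh < 32; sh += 8)
            tmp |= (uint32_t)mul_div_255((uint8_t)(src >> sh), m) << sh;

        uint8_t not_ta = (uint8_t)(255 - (tmp >> 24));
        for (int sh = 0; sh < 32; sh += 8) {
            uint8_t t = (uint8_t)(tmp >> sh);
            uint8_t d = (uint8_t)(dst >> sh);
            out |= (uint32_t)sat_add_u8(t, mul_div_255(d, not_ta)) << sh;
        }
        return out;
    }

over_8888_n_8888 reads the mask once per composite and keeps only its alpha byte, while over_n_8_8888 keeps the solid source split into even/odd bytes and reads one mask byte per pixel; both reduce to the arithmetic above.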
-
-.macro over_reverse_n_8888_init
-        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
-        ldr     MASK, =0x00800080
-        /* Split source pixel into RB/AG parts */
-        uxtb16  STRIDE_S, SRC
-        uxtb16  STRIDE_M, SRC, ror #8
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, MASK, MASK
-        line_saved_regs STRIDE_D, ORIG_W
-.endm
-
-.macro over_reverse_n_8888_newline
-        mov     STRIDE_D, #0xFF
-.endm
-
-.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
-.endm
-
-.macro over_reverse_n_8888_1pixel  d, is_only
-        teq     WK&d, #0
-        beq     8f       /* replace with source */
-        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
-    .if is_only == 1
-        beq     49f      /* skip store */
-    .else
-        beq     9f       /* write same value back */
-    .endif
-        mla     SCRATCH, STRIDE_S, ORIG_W, MASK  /* red/blue */
-        mla     ORIG_W, STRIDE_M, ORIG_W, MASK   /* alpha/green */
-        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
-        mov     SCRATCH, SCRATCH, ror #8
-        sel     ORIG_W, SCRATCH, ORIG_W
-        uqadd8  WK&d, WK&d, ORIG_W
-        b       9f
-8:      mov     WK&d, SRC
-9:
-.endm
-
-.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
-    .if numbytes == 4
-        over_reverse_n_8888_1pixel reg1, 1
-    .else
-        and     SCRATCH, WK&reg1, WK&reg2
-     .if numbytes == 16
-        and     SCRATCH, SCRATCH, WK&reg3
-        and     SCRATCH, SCRATCH, WK&reg4
-     .endif
-        mvns    SCRATCH, SCRATCH, asr #24
-        beq     49f      /* skip store if all opaque */
-        over_reverse_n_8888_1pixel reg1, 0
-        over_reverse_n_8888_1pixel reg2, 0
-     .if numbytes == 16
-        over_reverse_n_8888_1pixel reg3, 0
-        over_reverse_n_8888_1pixel reg4, 0
-     .endif
-    .endif
-        pixst   , numbytes, reg1, DST
-49:
-.endm
-
-.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
-        over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
-.endm
-
-generate_composite_function \
-    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
-    3, /* prefetch distance */ \
-    over_reverse_n_8888_init, \
-    over_reverse_n_8888_newline, \
-    nop_macro, /* cleanup */ \
-    over_reverse_n_8888_process_head, \
-    over_reverse_n_8888_process_tail
-
-/******************************************************************************/
-
-.macro over_white_8888_8888_ca_init
-    HALF .req SRC
-    TMP0 .req STRIDE_D
-    TMP1 .req STRIDE_S
-    TMP2 .req STRIDE_M
-    TMP3 .req ORIG_W
-    WK4 .req SCRATCH
-        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
-        ldr     SCRATCH, =0x800080
-        mov     HALF, #0x80
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, SCRATCH, SCRATCH
-    .set DST_PRELOAD_BIAS, 8
-.endm
-
-.macro over_white_8888_8888_ca_cleanup
-    .set DST_PRELOAD_BIAS, 0
-    .unreq HALF
-    .unreq TMP0
-    .unreq TMP1
-    .unreq TMP2
-    .unreq TMP3
-    .unreq WK4
-.endm
-
-.macro over_white_8888_8888_ca_combine  m, d
-        uxtb16  TMP1, TMP0                /* rb_notmask */
-        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
-        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
-        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
-        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
-        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
-        smlatt  d, TMP1, TMP0, HALF       /* alpha */
-        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
-        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
-        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
-        uxtab16 TMP0, TMP0, TMP0, ror #8
-        uxtab16 TMP1, TMP1, TMP1, ror #8
-        mov     TMP0, TMP0, ror #8
-        sel     d, TMP0, TMP1
-        uqadd8  d, d, m                   /* d is a late result */
-.endm
-
-.macro over_white_8888_8888_ca_1pixel_head
-        pixld   , 4, 1, MASK, 0
-        pixld   , 4, 3, DST, 0
-.endm
-
-.macro over_white_8888_8888_ca_1pixel_tail
-        mvn     TMP0, WK1
-        teq     WK1, WK1, asr #32
-        bne     01f
-        bcc     03f
-        mov     WK3, WK1
-        b       02f
-01:     over_white_8888_8888_ca_combine WK1, WK3
-02:     pixst   , 4, 3, DST
-03:
-.endm
-
-.macro over_white_8888_8888_ca_2pixels_head
-        pixld   , 8, 1, MASK, 0
-.endm
-
-.macro over_white_8888_8888_ca_2pixels_tail
-        pixld   , 8, 3, DST
-        mvn     TMP0, WK1
-        teq     WK1, WK1, asr #32
-        bne     01f
-        movcs   WK3, WK1
-        bcs     02f
-        teq     WK2, #0
-        beq     05f
-        b       02f
-01:     over_white_8888_8888_ca_combine WK1, WK3
-02:     mvn     TMP0, WK2
-        teq     WK2, WK2, asr #32
-        bne     03f
-        movcs   WK4, WK2
-        b       04f
-03:     over_white_8888_8888_ca_combine WK2, WK4
-04:     pixst   , 8, 3, DST
-05:
-.endm
-
-.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    .if numbytes == 4
-        over_white_8888_8888_ca_1pixel_head
-    .else
-     .if numbytes == 16
-        over_white_8888_8888_ca_2pixels_head
-        over_white_8888_8888_ca_2pixels_tail
-     .endif
-        over_white_8888_8888_ca_2pixels_head
-    .endif
-.endm
-
-.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
-    .if numbytes == 4
-        over_white_8888_8888_ca_1pixel_tail
-    .else
-        over_white_8888_8888_ca_2pixels_tail
-    .endif
-.endm
-
-generate_composite_function \
-    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
-    2, /* prefetch distance */ \
-    over_white_8888_8888_ca_init, \
-    nop_macro, /* newline */ \
-    over_white_8888_8888_ca_cleanup, \
-    over_white_8888_8888_ca_process_head, \
-    over_white_8888_8888_ca_process_tail
-
-
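over_white_8888_8888_ca above, and over_n_8888_8888_ca after it, handle component alpha: the 32-bit mask supplies a separate weight for each channel rather than a single alpha. A rough C rendering of the per-channel formula, with the function name and the scalar loop being mine; the assembly additionally short-circuits fully transparent and fully opaque mask values, as the teq/asr #32 tests show:

    #include <stdint.h>

    static uint8_t mul_div_255(uint8_t a, uint8_t b)   /* round(a*b/255) */
    {
        uint32_t t = (uint32_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }

    static uint8_t sat_add_u8(uint8_t a, uint8_t b)
    {
        uint32_t s = (uint32_t)a + b;
        return (uint8_t)(s > 255 ? 255 : s);
    }

    /* Component-alpha OVER with a solid ARGB8888 source: per channel,
     *   dst' = src*m/255 + dst*(255 - Asrc*m/255)/255
     * For the white, fully opaque source handled above (src == 0xFFFFFFFF)
     * this collapses to dst' = m + dst*(255 - m)/255, which is what
     * over_white_8888_8888_ca_combine computes from the inverted mask.  */
    static uint32_t over_ca_pixel(uint32_t src, uint32_t mask, uint32_t dst)
    {
        uint8_t  a_src = (uint8_t)(src >> 24);
        uint32_t out = 0;
        for (int sh = 0; sh < 32; sh += 8) {
            uint8_t m  = (uint8_t)(mask >> sh);
            uint8_t s  = mul_div_255((uint8_t)(src >> sh), m);
            uint8_t nd = (uint8_t)(255 - mul_div_255(a_src, m));
            out |= (uint32_t)sat_add_u8(s, mul_div_255((uint8_t)(dst >> sh), nd)) << sh;
        }
        return out;
    }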
-.macro over_n_8888_8888_ca_init
-        /* Set up constants. RB_SRC and AG_SRC are in registers;
-         * RB_FLDS, A_SRC, and the two HALF values need to go on the
-         * stack (and the full SRC value is already there) */
-        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
-        mov     WK0, #0x00FF0000
-        orr     WK0, WK0, #0xFF            /* RB_FLDS (0x00FF00FF) */
-        mov     WK1, #0x80                 /* HALF default value */
-        mov     WK2, SCRATCH, lsr #24      /* A_SRC */
-        orr     WK3, WK1, WK1, lsl #16     /* HALF alternate value (0x00800080) */
-        push    {WK0-WK3}
-    .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
-        uxtb16  SRC, SCRATCH
-        uxtb16  STRIDE_S, SCRATCH, ror #8
-
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, WK3, WK3
-
-    .unreq WK0
-    .unreq WK1
-    .unreq WK2
-    .unreq WK3
-    WK0 .req Y
-    WK1 .req STRIDE_D
-    RB_SRC .req SRC
-    AG_SRC .req STRIDE_S
-    WK2 .req STRIDE_M
-    RB_FLDS .req r8    /* the reloaded constants have to be at consecutive registers starting at an even one */
-    A_SRC .req r8
-    HALF .req r9
-    WK3 .req r10
-    WK4 .req r11
-    WK5 .req SCRATCH
-    WK6 .req ORIG_W
-
-        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
-.endm
-
-.macro over_n_8888_8888_ca_cleanup
-        add     sp, sp, #16
-    .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
-
-    .unreq WK0
-    .unreq WK1
-    .unreq RB_SRC
-    .unreq AG_SRC
-    .unreq WK2
-    .unreq RB_FLDS
-    .unreq A_SRC
-    .unreq HALF
-    .unreq WK3
-    .unreq WK4
-    .unreq WK5
-    .unreq WK6
-    WK0 .req r8
-    WK1 .req r9
-    WK2 .req r10
-    WK3 .req r11
-.endm
-
-.macro over_n_8888_8888_ca_1pixel_head
-        pixld   , 4, 6, MASK, 0
-        pixld   , 4, 0, DST, 0
-.endm
-
-.macro over_n_8888_8888_ca_1pixel_tail
-        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
-        uxtb16  WK1, WK6               /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
-        teq     WK6, WK6, asr #32      /* Zc if transparent, ZC if opaque */
-        bne     20f
-        bcc     40f
-        /* Mask is fully opaque (all channels) */
-        ldr     WK6, [sp, #ARGS_STACK_OFFSET]  /* get SRC back */
-        eors    A_SRC, A_SRC, #0xFF
-        bne     10f
-        /* Source is also opaque - same as src_8888_8888 */
-        mov     WK0, WK6
-        b       30f
-10:     /* Same as over_8888_8888 */
-        mul_8888_8  WK0, A_SRC, WK5, HALF
-        uqadd8  WK0, WK0, WK6
-        b       30f
-20:     /* No simplifications possible - do it the hard way */
-        uxtb16  WK2, WK6, ror #8       /* ag_mask */
-        mla     WK3, WK1, A_SRC, HALF  /* rb_mul; 2 cycles */
-        mla     WK4, WK2, A_SRC, HALF  /* ag_mul; 2 cycles */
-        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
-        uxtb16  WK5, WK0               /* rb_dest */
-        uxtab16 WK3, WK3, WK3, ror #8
-        uxtb16  WK6, WK0, ror #8       /* ag_dest */
-        uxtab16 WK4, WK4, WK4, ror #8
-        smlatt  WK0, RB_SRC, WK1, HALF /* red1 */
-        smlabb  WK1, RB_SRC, WK1, HALF /* blue1 */
-        bic     WK3, RB_FLDS, WK3, lsr #8
-        bic     WK4, RB_FLDS, WK4, lsr #8
-        pkhbt   WK1, WK1, WK0, lsl #16 /* rb1 */
-        smlatt  WK0, WK5, WK3, HALF    /* red2 */
-        smlabb  WK3, WK5, WK3, HALF    /* blue2 */
-        uxtab16 WK1, WK1, WK1, ror #8
-        smlatt  WK5, AG_SRC, WK2, HALF /* alpha1 */
-        pkhbt   WK3, WK3, WK0, lsl #16 /* rb2 */
-        smlabb  WK0, AG_SRC, WK2, HALF /* green1 */
-        smlatt  WK2, WK6, WK4, HALF    /* alpha2 */
-        smlabb  WK4, WK6, WK4, HALF    /* green2 */
-        pkhbt   WK0, WK0, WK5, lsl #16 /* ag1 */
-        uxtab16 WK3, WK3, WK3, ror #8
-        pkhbt   WK4, WK4, WK2, lsl #16 /* ag2 */
-        uxtab16 WK0, WK0, WK0, ror #8
-        uxtab16 WK4, WK4, WK4, ror #8
-        mov     WK1, WK1, ror #8
-        mov     WK3, WK3, ror #8
-        sel     WK2, WK1, WK0          /* recombine source*mask */
-        sel     WK1, WK3, WK4          /* recombine dest*(1-source_alpha*mask) */
-        uqadd8  WK0, WK1, WK2          /* followed by 1 stall */
-30:     /* The destination buffer is already in the L1 cache, so
-         * there's little point in amalgamating writes */
-        pixst   , 4, 0, DST
-40:
-.endm
-
-.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-    .rept (numbytes / 4) - 1
-        over_n_8888_8888_ca_1pixel_head
-        over_n_8888_8888_ca_1pixel_tail
-    .endr
-        over_n_8888_8888_ca_1pixel_head
-.endm
-
-.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
-        over_n_8888_8888_ca_1pixel_tail
-.endm
-
-pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
-        ldr     ip, [sp]
-        cmp     ip, #-1
-        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
-        /* else drop through... */
-        .endfunc
-generate_composite_function \
-    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
-    2, /* prefetch distance */ \
-    over_n_8888_8888_ca_init, \
-    nop_macro, /* newline */ \
-    over_n_8888_8888_ca_cleanup, \
-    over_n_8888_8888_ca_process_head, \
-    over_n_8888_8888_ca_process_tail
-
-/******************************************************************************/
-
-.macro in_reverse_8888_8888_init
-        /* Hold loop invariant in MASK */
-        ldr     MASK, =0x00800080
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, MASK, MASK
-        /* Offset the source pointer: we only need the alpha bytes */
-        add     SRC, SRC, #3
-        line_saved_regs ORIG_W
-.endm
-
-.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
-        ldrb    ORIG_W, [SRC], #4
-    .if numbytes >= 8
-        ldrb    WK&reg1, [SRC], #4
-     .if numbytes == 16
-        ldrb    WK&reg2, [SRC], #4
-        ldrb    WK&reg3, [SRC], #4
-     .endif
-    .endif
-        add     DST, DST, #numbytes
-.endm
-
-.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
-.endm
-
-.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
-    .if is_only != 1
-        movs    s, ORIG_W
-     .if offset != 0
-        ldrb    ORIG_W, [SRC, #offset]
-     .endif
-        beq     01f
-        teq     STRIDE_M, #0xFF
-        beq     02f
-    .endif
-        uxtb16  SCRATCH, d             /* rb_dest */
-        uxtb16  d, d, ror #8           /* ag_dest */
-        mla     SCRATCH, SCRATCH, s, MASK
-        mla     d, d, s, MASK
-        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        uxtab16 d, d, d, ror #8
-        mov     SCRATCH, SCRATCH, ror #8
-        sel     d, SCRATCH, d
-        b       02f
-    .if offset == 0
-48:     /* Last mov d,#0 of the set - used as part of shortcut for
-         * source values all 0 */
-    .endif
-01:     mov     d, #0
-02:
-.endm
-
-.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
-    .if numbytes == 4
-        teq     ORIG_W, ORIG_W, asr #32
-        ldrne   WK&reg1, [DST, #-4]
-    .elseif numbytes == 8
-        teq     ORIG_W, WK&reg1
-        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg2}
-    .else
-        teq     ORIG_W, WK&reg1
-        teqeq   ORIG_W, WK&reg2
-        teqeq   ORIG_W, WK&reg3
-        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg4}
-    .endif
-        cmnne   DST, #0   /* clear C if NE */
-        bcs     49f       /* no writes to dest if source all -1 */
-        beq     48f       /* set dest to all 0 if source all 0 */
-    .if numbytes == 4
-        in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
-        str     WK&reg1, [DST, #-4]
-    .elseif numbytes == 8
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg2}
-    .else
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
-        in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg4}
-    .endif
-49:
-.endm
-
-.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
-        in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
-.endm
-
-generate_composite_function \
-    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
-    2, /* prefetch distance */ \
-    in_reverse_8888_8888_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    in_reverse_8888_8888_process_head, \
-    in_reverse_8888_8888_process_tail
-
-/******************************************************************************/
-
-.macro over_n_8888_init
-        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
-        /* Hold loop invariant in MASK */
-        ldr     MASK, =0x00800080
-        /* Hold multiplier for destination in STRIDE_M */
-        mov     STRIDE_M, #255
-        sub     STRIDE_M, STRIDE_M, SRC, lsr #24
-        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
-        uadd8   SCRATCH, MASK, MASK
-.endm
-
-.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
-.endm
-
-.macro over_n_8888_1pixel  dst
-        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
-        uqadd8  WK&dst, WK&dst, SRC
-.endm
-
-.macro over_n_8888_process_tail  cond, numbytes, firstreg
-    .set PROCESS_REG, firstreg
-    .rept numbytes / 4
-        over_n_8888_1pixel %(PROCESS_REG)
-     .set PROCESS_REG, PROCESS_REG+1
-    .endr
-        pixst   , numbytes, firstreg, DST
-.endm
-
-generate_composite_function \
-    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
-    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
-    2, /* prefetch distance */ \
-    over_n_8888_init, \
-    nop_macro, /* newline */ \
-    nop_macro, /* cleanup */ \
-    over_n_8888_process_head, \
-    over_n_8888_process_tail
-
-/******************************************************************************/