diff options
Diffstat (limited to 'libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h')
-rw-r--r-- | libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h | 966 |
1 files changed, 0 insertions, 966 deletions
diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h deleted file mode 100644 index da153c3..0000000 --- a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h +++ /dev/null @@ -1,966 +0,0 @@ -/* - * Copyright © 2012 Raspberry Pi Foundation - * Copyright © 2012 RISC OS Open Ltd - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of the copyright holders not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. The copyright holders make no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Ben Avison (bavison@riscosopen.org) - * - */ - -/* - * Because the alignment of pixel data to cachelines, and even the number of - * cachelines per row can vary from row to row, and because of the need to - * preload each scanline once and only once, this prefetch strategy treats - * each row of pixels independently. When a pixel row is long enough, there - * are three distinct phases of prefetch: - * * an inner loop section, where each time a cacheline of data is - * processed, another cacheline is preloaded (the exact distance ahead is - * determined empirically using profiling results from lowlevel-blt-bench) - * * a leading section, where enough cachelines are preloaded to ensure no - * cachelines escape being preloaded when the inner loop starts - * * a trailing section, where a limited number (0 or more) of cachelines - * are preloaded to deal with data (if any) that hangs off the end of the - * last iteration of the inner loop, plus any trailing bytes that were not - * enough to make up one whole iteration of the inner loop - * - * There are (in general) three distinct code paths, selected between - * depending upon how long the pixel row is. If it is long enough that there - * is at least one iteration of the inner loop (as described above) then - * this is described as the "wide" case. If it is shorter than that, but - * there are still enough bytes output that there is at least one 16-byte- - * long, 16-byte-aligned write to the destination (the optimum type of - * write), then this is the "medium" case. If it is not even this long, then - * this is the "narrow" case, and there is no attempt to align writes to - * 16-byte boundaries. In the "medium" and "narrow" cases, all the - * cachelines containing data from the pixel row are prefetched up-front. - */ - -/* - * Determine whether we put the arguments on the stack for debugging. - */ -#undef DEBUG_PARAMS - -/* - * Bit flags for 'generate_composite_function' macro which are used - * to tune generated functions behavior. - */ -.set FLAG_DST_WRITEONLY, 0 -.set FLAG_DST_READWRITE, 1 -.set FLAG_COND_EXEC, 0 -.set FLAG_BRANCH_OVER, 2 -.set FLAG_PROCESS_PRESERVES_PSR, 0 -.set FLAG_PROCESS_CORRUPTS_PSR, 4 -.set FLAG_PROCESS_DOESNT_STORE, 0 -.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */ -.set FLAG_NO_SPILL_LINE_VARS, 0 -.set FLAG_SPILL_LINE_VARS_WIDE, 16 -.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 -.set FLAG_SPILL_LINE_VARS, 48 -.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 -.set FLAG_PROCESS_PRESERVES_SCRATCH, 64 -.set FLAG_PROCESS_PRESERVES_WK0, 0 -.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ -.set FLAG_PRELOAD_DST, 0 -.set FLAG_NO_PRELOAD_DST, 256 - -/* - * Number of bytes by which to adjust preload offset of destination - * buffer (allows preload instruction to be moved before the load(s)) - */ -.set DST_PRELOAD_BIAS, 0 - -/* - * Offset into stack where mask and source pointer/stride can be accessed. - */ -#ifdef DEBUG_PARAMS -.set ARGS_STACK_OFFSET, (9*4+9*4) -#else -.set ARGS_STACK_OFFSET, (9*4) -#endif - -/* - * Offset into stack where space allocated during init macro can be accessed. - */ -.set LOCALS_STACK_OFFSET, 0 - -/* - * Constants for selecting preferable prefetch type. - */ -.set PREFETCH_TYPE_NONE, 0 -.set PREFETCH_TYPE_STANDARD, 1 - -/* - * Definitions of macros for load/store of pixel data. - */ - -.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 - .if numbytes == 16 - .if unaligned == 1 - op&r&cond WK®0, [base], #4 - op&r&cond WK®1, [base], #4 - op&r&cond WK®2, [base], #4 - op&r&cond WK®3, [base], #4 - .else - op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} - .endif - .elseif numbytes == 8 - .if unaligned == 1 - op&r&cond WK®0, [base], #4 - op&r&cond WK®1, [base], #4 - .else - op&m&cond&ia base!, {WK®0,WK®1} - .endif - .elseif numbytes == 4 - op&r&cond WK®0, [base], #4 - .elseif numbytes == 2 - op&r&cond&h WK®0, [base], #2 - .elseif numbytes == 1 - op&r&cond&b WK®0, [base], #1 - .else - .error "unsupported size: numbytes" - .endif -.endm - -.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base - .if numbytes == 16 - stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} - .elseif numbytes == 8 - stm&cond&db base, {WK®0,WK®1} - .elseif numbytes == 4 - str&cond WK®0, [base, #-4] - .elseif numbytes == 2 - str&cond&h WK®0, [base, #-2] - .elseif numbytes == 1 - str&cond&b WK®0, [base, #-1] - .else - .error "unsupported size: numbytes" - .endif -.endm - -.macro pixld cond, numbytes, firstreg, base, unaligned - pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned -.endm - -.macro pixst cond, numbytes, firstreg, base - .if (flags) & FLAG_DST_READWRITE - pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base - .else - pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base - .endif -.endm - -.macro PF a, x:vararg - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) - a x - .endif -.endm - - -.macro preload_leading_step1 bpp, ptr, base -/* If the destination is already 16-byte aligned, then we need to preload - * between 0 and prefetch_distance (inclusive) cache lines ahead so there - * are no gaps when the inner loop starts. - */ - .if bpp > 0 - PF bic, ptr, base, #31 - .set OFFSET, 0 - .rept prefetch_distance+1 - PF pld, [ptr, #OFFSET] - .set OFFSET, OFFSET+32 - .endr - .endif -.endm - -.macro preload_leading_step2 bpp, bpp_shift, ptr, base -/* However, if the destination is not 16-byte aligned, we may need to - * preload more cache lines than that. The question we need to ask is: - * are the bytes corresponding to the leading pixels more than the amount - * by which the source pointer will be rounded down for preloading, and if - * so, by how many cache lines? Effectively, we want to calculate - * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp - * inner_loop_offset = (src+leading_bytes)&31 - * extra_needed = leading_bytes - inner_loop_offset - * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only - * possible when there are 4 src bytes for every 1 dst byte). - */ - .if bpp > 0 - .ifc base,DST - /* The test can be simplified further when preloading the destination */ - PF tst, base, #16 - PF beq, 61f - .else - .if bpp/dst_w_bpp == 4 - PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift - PF and, SCRATCH, SCRATCH, #31 - PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift - PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ - PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ - PF bcs, 61f - PF bpl, 60f - PF pld, [ptr, #32*(prefetch_distance+2)] - .else - PF mov, SCRATCH, base, lsl #32-5 - PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift - PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift - PF bls, 61f - .endif - .endif -60: PF pld, [ptr, #32*(prefetch_distance+1)] -61: - .endif -.endm - -#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) -.macro preload_middle bpp, base, scratch_holds_offset - .if bpp > 0 - /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ - .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) - .if scratch_holds_offset - PF pld, [base, SCRATCH] - .else - PF bic, SCRATCH, base, #31 - PF pld, [SCRATCH, #32*prefetch_distance] - .endif - .endif - .endif -.endm - -.macro preload_trailing bpp, bpp_shift, base - .if bpp > 0 - .if bpp*pix_per_block > 256 - /* Calculations are more complex if more than one fetch per block */ - PF and, WK1, base, #31 - PF add, WK1, WK1, WK0, lsl #bpp_shift - PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) - PF bic, SCRATCH, base, #31 -80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] - PF add, SCRATCH, SCRATCH, #32 - PF subs, WK1, WK1, #32 - PF bhi, 80b - .else - /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ - PF mov, SCRATCH, base, lsl #32-5 - PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift - PF adceqs, SCRATCH, SCRATCH, #0 - /* The instruction above has two effects: ensures Z is only - * set if C was clear (so Z indicates that both shifted quantities - * were 0), and clears C if Z was set (so C indicates that the sum - * of the shifted quantities was greater and not equal to 32) */ - PF beq, 82f - PF bic, SCRATCH, base, #31 - PF bcc, 81f - PF pld, [SCRATCH, #32*(prefetch_distance+2)] -81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] -82: - .endif - .endif -.endm - - -.macro preload_line narrow_case, bpp, bpp_shift, base -/* "narrow_case" - just means that the macro was invoked from the "narrow" - * code path rather than the "medium" one - because in the narrow case, - * the row of pixels is known to output no more than 30 bytes, then - * (assuming the source pixels are no wider than the the destination - * pixels) they cannot possibly straddle more than 2 32-byte cachelines, - * meaning there's no need for a loop. - * "bpp" - number of bits per pixel in the channel (source, mask or - * destination) that's being preloaded, or 0 if this channel is not used - * for reading - * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) - * "base" - base address register of channel to preload (SRC, MASK or DST) - */ - .if bpp > 0 - .if narrow_case && (bpp <= dst_w_bpp) - /* In these cases, each line for each channel is in either 1 or 2 cache lines */ - PF bic, WK0, base, #31 - PF pld, [WK0] - PF add, WK1, base, X, LSL #bpp_shift - PF sub, WK1, WK1, #1 - PF bic, WK1, WK1, #31 - PF cmp, WK1, WK0 - PF beq, 90f - PF pld, [WK1] -90: - .else - PF bic, WK0, base, #31 - PF pld, [WK0] - PF add, WK1, base, X, lsl #bpp_shift - PF sub, WK1, WK1, #1 - PF bic, WK1, WK1, #31 - PF cmp, WK1, WK0 - PF beq, 92f -91: PF add, WK0, WK0, #32 - PF cmp, WK0, WK1 - PF pld, [WK0] - PF bne, 91b -92: - .endif - .endif -.endm - - -.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 - .if decrementx - sub&cond X, X, #8*numbytes/dst_w_bpp - .endif - process_tail cond, numbytes, firstreg - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst cond, numbytes, firstreg, DST - .endif -.endm - -.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - .if (flags) & FLAG_BRANCH_OVER - .ifc cond,mi - bpl 100f - .endif - .ifc cond,cs - bcc 100f - .endif - .ifc cond,ne - beq 100f - .endif - conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx -100: - .else - conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - .endif -.endm - -.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx - .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) - /* Can't interleave reads and writes */ - test - conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx - .if (flags) & FLAG_PROCESS_CORRUPTS_PSR - test - .endif - conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx - .else - /* Can interleave reads and writes for better scheduling */ - test - process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 - process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 - .if decrementx - sub&cond1 X, X, #8*numbytes1/dst_w_bpp - sub&cond2 X, X, #8*numbytes2/dst_w_bpp - .endif - process_tail cond1, numbytes1, firstreg1 - process_tail cond2, numbytes2, firstreg2 - pixst cond1, numbytes1, firstreg1, DST - pixst cond2, numbytes2, firstreg2, DST - .endif -.endm - - -.macro test_bits_1_0_ptr - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ - .else - movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ - .endif -.endm - -.macro test_bits_3_2_ptr - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ - .else - movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ - .endif -.endm - -.macro leading_15bytes process_head, process_tail - /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ - .set DECREMENT_X, 1 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - .set DECREMENT_X, 0 - sub X, X, WK0, lsr #dst_bpp_shift - str X, [sp, #LINE_SAVED_REG_COUNT*4] - mov X, WK0 - .endif - /* Use unaligned loads in all cases for simplicity */ - .if dst_w_bpp == 8 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X - .elseif dst_w_bpp == 16 - test_bits_1_0_ptr - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X - .endif - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - ldr X, [sp, #LINE_SAVED_REG_COUNT*4] - .endif -.endm - -.macro test_bits_3_2_pix - movs SCRATCH, X, lsl #dst_bpp_shift+32-3 -.endm - -.macro test_bits_1_0_pix - .if dst_w_bpp == 8 - movs SCRATCH, X, lsl #dst_bpp_shift+32-1 - .else - movs SCRATCH, X, lsr #1 - .endif -.endm - -.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask - conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 - .if dst_w_bpp == 16 - test_bits_1_0_pix - conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 - .elseif dst_w_bpp == 8 - conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 - .endif -.endm - - -.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment -110: - .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ - .rept pix_per_block*dst_w_bpp/128 - process_head , 16, 0, unaligned_src, unaligned_mask, 1 - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - preload_middle src_bpp, SRC, 1 - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - preload_middle mask_bpp, MASK, 1 - .else - preload_middle src_bpp, SRC, 0 - preload_middle mask_bpp, MASK, 0 - .endif - .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) - /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that - * destination prefetches are 32-byte aligned. It's also the easiest channel to offset - * preloads for, to achieve staggered prefetches for multiple channels, because there are - * always two STMs per prefetch, so there is always an opposite STM on which to put the - * preload. Note, no need to BIC the base register here */ - PF pld, [DST, #32*prefetch_distance - dst_alignment] - .endif - process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - .set SUBBLOCK, SUBBLOCK+1 - .endr - subs X, X, #pix_per_block - bhs 110b -.endm - -.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask - /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ - .if dst_r_bpp > 0 - tst DST, #16 - bne 111f - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS - b 112f -111: - .endif - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS -112: - /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ - .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) - PF and, WK0, X, #pix_per_block-1 - .endif - preload_trailing src_bpp, src_bpp_shift, SRC - preload_trailing mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_trailing dst_r_bpp, dst_bpp_shift, DST - .endif - add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp - /* The remainder of the line is handled identically to the medium case */ - medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask -.endm - -.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask -120: - process_head , 16, 0, unaligned_src, unaligned_mask, 0 - process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - subs X, X, #128/dst_w_bpp - bhs 120b - /* Trailing pixels */ - tst X, #128/dst_w_bpp - 1 - beq exit_label - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -.endm - -.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask - tst X, #16*8/dst_w_bpp - conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 - /* Trailing pixels */ - /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -.endm - -.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label - /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ - .if mask_bpp == 8 || mask_bpp == 16 - tst MASK, #3 - bne 141f - .endif - .if src_bpp == 8 || src_bpp == 16 - tst SRC, #3 - bne 140f - .endif - action process_head, process_tail, process_inner_loop, exit_label, 0, 0 - .if src_bpp == 8 || src_bpp == 16 - b exit_label -140: - action process_head, process_tail, process_inner_loop, exit_label, 1, 0 - .endif - .if mask_bpp == 8 || mask_bpp == 16 - b exit_label -141: - .if src_bpp == 8 || src_bpp == 16 - tst SRC, #3 - bne 142f - .endif - action process_head, process_tail, process_inner_loop, exit_label, 0, 1 - .if src_bpp == 8 || src_bpp == 16 - b exit_label -142: - action process_head, process_tail, process_inner_loop, exit_label, 1, 1 - .endif - .endif -.endm - - -.macro end_of_line restore_x, vars_spilled, loop_label, last_one - .if vars_spilled - /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ - /* This is ldmia sp,{} */ - .word 0xE89D0000 | LINE_SAVED_REGS - .endif - subs Y, Y, #1 - .if vars_spilled - .if (LINE_SAVED_REGS) & (1<<1) - str Y, [sp] - .endif - .endif - add DST, DST, STRIDE_D - .if src_bpp > 0 - add SRC, SRC, STRIDE_S - .endif - .if mask_bpp > 0 - add MASK, MASK, STRIDE_M - .endif - .if restore_x - mov X, ORIG_W - .endif - bhs loop_label - .ifc "last_one","" - .if vars_spilled - b 197f - .else - b 198f - .endif - .else - .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) - b 198f - .endif - .endif -.endm - - -.macro generate_composite_function fname, \ - src_bpp_, \ - mask_bpp_, \ - dst_w_bpp_, \ - flags_, \ - prefetch_distance_, \ - init, \ - newline, \ - cleanup, \ - process_head, \ - process_tail, \ - process_inner_loop - - pixman_asm_function fname - -/* - * Make some macro arguments globally visible and accessible - * from other macros - */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set flags, flags_ - .set prefetch_distance, prefetch_distance_ - -/* - * Select prefetch type for this function. - */ - .if prefetch_distance == 0 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - .else - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD - .endif - - .if src_bpp == 32 - .set src_bpp_shift, 2 - .elseif src_bpp == 24 - .set src_bpp_shift, 0 - .elseif src_bpp == 16 - .set src_bpp_shift, 1 - .elseif src_bpp == 8 - .set src_bpp_shift, 0 - .elseif src_bpp == 0 - .set src_bpp_shift, -1 - .else - .error "requested src bpp (src_bpp) is not supported" - .endif - - .if mask_bpp == 32 - .set mask_bpp_shift, 2 - .elseif mask_bpp == 24 - .set mask_bpp_shift, 0 - .elseif mask_bpp == 8 - .set mask_bpp_shift, 0 - .elseif mask_bpp == 0 - .set mask_bpp_shift, -1 - .else - .error "requested mask bpp (mask_bpp) is not supported" - .endif - - .if dst_w_bpp == 32 - .set dst_bpp_shift, 2 - .elseif dst_w_bpp == 24 - .set dst_bpp_shift, 0 - .elseif dst_w_bpp == 16 - .set dst_bpp_shift, 1 - .elseif dst_w_bpp == 8 - .set dst_bpp_shift, 0 - .else - .error "requested dst bpp (dst_w_bpp) is not supported" - .endif - - .if (((flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp - .else - .set dst_r_bpp, 0 - .endif - - .set pix_per_block, 16*8/dst_w_bpp - .if src_bpp != 0 - .if 32*8/src_bpp > pix_per_block - .set pix_per_block, 32*8/src_bpp - .endif - .endif - .if mask_bpp != 0 - .if 32*8/mask_bpp > pix_per_block - .set pix_per_block, 32*8/mask_bpp - .endif - .endif - .if dst_r_bpp != 0 - .if 32*8/dst_r_bpp > pix_per_block - .set pix_per_block, 32*8/dst_r_bpp - .endif - .endif - -/* The standard entry conditions set up by pixman-arm-common.h are: - * r0 = width (pixels) - * r1 = height (rows) - * r2 = pointer to top-left pixel of destination - * r3 = destination stride (pixels) - * [sp] = source pixel value, or pointer to top-left pixel of source - * [sp,#4] = 0 or source stride (pixels) - * The following arguments are unused for non-mask operations - * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask - * [sp,#12] = 0 or mask stride (pixels) - */ - -/* - * Assign symbolic names to registers - */ - X .req r0 /* pixels to go on this line */ - Y .req r1 /* lines to go */ - DST .req r2 /* destination pixel pointer */ - STRIDE_D .req r3 /* destination stride (bytes, minus width) */ - SRC .req r4 /* source pixel pointer */ - STRIDE_S .req r5 /* source stride (bytes, minus width) */ - MASK .req r6 /* mask pixel pointer (if applicable) */ - STRIDE_M .req r7 /* mask stride (bytes, minus width) */ - WK0 .req r8 /* pixel data registers */ - WK1 .req r9 - WK2 .req r10 - WK3 .req r11 - SCRATCH .req r12 - ORIG_W .req r14 /* width (pixels) */ - - push {r4-r11, lr} /* save all registers */ - - subs Y, Y, #1 - blo 199f - -#ifdef DEBUG_PARAMS - sub sp, sp, #9*4 -#endif - - .if src_bpp > 0 - ldr SRC, [sp, #ARGS_STACK_OFFSET] - ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] - .endif - .if mask_bpp > 0 - ldr MASK, [sp, #ARGS_STACK_OFFSET+8] - ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] - .endif - -#ifdef DEBUG_PARAMS - add Y, Y, #1 - stmia sp, {r0-r7,pc} - sub Y, Y, #1 -#endif - - init - - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - /* Reserve a word in which to store X during leading pixels */ - sub sp, sp, #4 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 - .endif - - lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ - sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift - .if src_bpp > 0 - lsl STRIDE_S, #src_bpp_shift - sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift - .endif - .if mask_bpp > 0 - lsl STRIDE_M, #mask_bpp_shift - sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift - .endif - - /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ - cmp X, #2*16*8/dst_w_bpp - 1 - blo 170f - .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ - /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ - cmp X, #(prefetch_distance+3)*pix_per_block - 1 - blo 160f - - /* Wide case */ - /* Adjust X so that the decrement instruction can also test for - * inner loop termination. We want it to stop when there are - * (prefetch_distance+1) complete blocks to go. */ - sub X, X, #(prefetch_distance+2)*pix_per_block - mov ORIG_W, X - .if (flags) & FLAG_SPILL_LINE_VARS_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .endif -151: /* New line */ - newline - preload_leading_step1 src_bpp, WK1, SRC - preload_leading_step1 mask_bpp, WK2, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_leading_step1 dst_r_bpp, WK3, DST - .endif - - ands WK0, DST, #15 - beq 154f - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ - - preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC - preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST - .endif - - leading_15bytes process_head, process_tail - -154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - and SCRATCH, SRC, #31 - rsb SCRATCH, SCRATCH, #32*prefetch_distance - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - and SCRATCH, MASK, #31 - rsb SCRATCH, SCRATCH, #32*prefetch_distance - .endif - .ifc "process_inner_loop","" - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f - .else - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f - .endif - -157: /* Check for another line */ - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b - .if (flags) & FLAG_SPILL_LINE_VARS_WIDE - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .endif - .endif - - .ltorg - -160: /* Medium case */ - mov ORIG_W, X - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .endif -161: /* New line */ - newline - preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ - preload_line 0, mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_line 0, dst_r_bpp, dst_bpp_shift, DST - .endif - - sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ - ands WK0, DST, #15 - beq 164f - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ - - leading_15bytes process_head, process_tail - -164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ - switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f - -167: /* Check for another line */ - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b - - .ltorg - -170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ - .if dst_w_bpp < 32 - mov ORIG_W, X - .endif - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .endif -171: /* New line */ - newline - preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ - preload_line 1, mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_line 1, dst_r_bpp, dst_bpp_shift, DST - .endif - - .if dst_w_bpp == 8 - tst DST, #3 - beq 174f -172: subs X, X, #1 - blo 177f - process_head , 1, 0, 1, 1, 0 - process_tail , 1, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 1, 0, DST - .endif - tst DST, #3 - bne 172b - .elseif dst_w_bpp == 16 - tst DST, #2 - beq 174f - subs X, X, #1 - blo 177f - process_head , 2, 0, 1, 1, 0 - process_tail , 2, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 2, 0, DST - .endif - .endif - -174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ - switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f - -177: /* Check for another line */ - end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .endif - -197: - .if (flags) & FLAG_SPILL_LINE_VARS - add sp, sp, #LINE_SAVED_REG_COUNT*4 - .endif -198: - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 - add sp, sp, #4 - .endif - - cleanup - -#ifdef DEBUG_PARAMS - add sp, sp, #9*4 /* junk the debug copy of arguments */ -#endif -199: - pop {r4-r11, pc} /* exit */ - - .ltorg - - .unreq X - .unreq Y - .unreq DST - .unreq STRIDE_D - .unreq SRC - .unreq STRIDE_S - .unreq MASK - .unreq STRIDE_M - .unreq WK0 - .unreq WK1 - .unreq WK2 - .unreq WK3 - .unreq SCRATCH - .unreq ORIG_W - .endfunc -.endm - -.macro line_saved_regs x:vararg - .set LINE_SAVED_REGS, 0 - .set LINE_SAVED_REG_COUNT, 0 - .irp SAVED_REG,x - .ifc "SAVED_REG","Y" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_D" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_S" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_M" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","ORIG_W" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .endr -.endm - -.macro nop_macro x:vararg -.endm |